From 0299c8b1eaa19916cc27e577a1453db7fb5e434d Mon Sep 17 00:00:00 2001
From: ferrreo <harderthanfire@gmail.com>
Date: Wed, 3 Apr 2024 17:43:13 +0100
Subject: [PATCH] Update to Linux 6.8.3

---
 VERSION                                     |     2 +-
 config                                      |    14 +-
 patches/cachyos/0001-bore-cachy.patch       |   278 +-
 patches/cachyos/0001-cachyos-base-all.patch | 11724 ++++++++++++++++--
 patches/series                              |     2 -
 scripts/config.sh                           |     5 +-
 6 files changed, 10766 insertions(+), 1259 deletions(-)

diff --git a/VERSION b/VERSION
index 5f6c086..021c940 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-6.8.1
+6.8.3
diff --git a/config b/config
index 53b954e..489c008 100644
--- a/config
+++ b/config
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/x86 6.8.1 Kernel Configuration
+# Linux/x86 6.8.3 Kernel Configuration
 #
 CONFIG_CC_VERSION_TEXT="gcc (GCC) 13.2.1 20230801"
 CONFIG_CC_IS_GCC=y
@@ -24,6 +24,7 @@ CONFIG_PAHOLE_VERSION=126
 CONFIG_IRQ_WORK=y
 CONFIG_BUILDTIME_TABLE_SORT=y
 CONFIG_THREAD_INFO_IN_TASK=y
+# CONFIG_ECHO_SCHED is not set
 
 #
 # General setup
@@ -518,7 +519,6 @@ CONFIG_X86_DIRECT_GBPAGES=y
 CONFIG_X86_CPA_STATISTICS=y
 CONFIG_X86_MEM_ENCRYPT=y
 CONFIG_AMD_MEM_ENCRYPT=y
-# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set
 CONFIG_NUMA=y
 CONFIG_AMD_NUMA=y
 CONFIG_X86_64_ACPI_NUMA=y
@@ -561,6 +561,7 @@ CONFIG_HZ_300=y
 # CONFIG_HZ_500 is not set
 # CONFIG_HZ_600 is not set
 # CONFIG_HZ_750 is not set
+# CONFIG_HZ_625 is not set
 # CONFIG_HZ_1000 is not set
 CONFIG_HZ=300
 CONFIG_SCHED_HRTICK=y
@@ -836,6 +837,8 @@ CONFIG_AS_SHA1_NI=y
 CONFIG_AS_SHA256_NI=y
 CONFIG_AS_TPAUSE=y
 CONFIG_AS_GFNI=y
+CONFIG_AS_VAES=y
+CONFIG_AS_VPCLMULQDQ=y
 CONFIG_AS_WRUSS=y
 
 #
@@ -1024,6 +1027,7 @@ CONFIG_MODULE_SRCVERSION_ALL=y
 CONFIG_MODULE_SIG=y
 # CONFIG_MODULE_SIG_FORCE is not set
 CONFIG_MODULE_SIG_ALL=y
+# CONFIG_MODULE_SIG_SHA1 is not set
 # CONFIG_MODULE_SIG_SHA256 is not set
 # CONFIG_MODULE_SIG_SHA384 is not set
 CONFIG_MODULE_SIG_SHA512=y
@@ -2053,7 +2057,6 @@ CONFIG_BT_BNEP_MC_FILTER=y
 CONFIG_BT_BNEP_PROTO_FILTER=y
 CONFIG_BT_CMTP=m
 CONFIG_BT_HIDP=m
-# CONFIG_BT_HS is not set
 CONFIG_BT_LE=y
 CONFIG_BT_LE_L2CAP_ECRED=y
 CONFIG_BT_6LOWPAN=m
@@ -6867,6 +6870,7 @@ CONFIG_DRM_AMD_DC=y
 CONFIG_DRM_AMD_DC_FP=y
 CONFIG_DRM_AMD_DC_SI=y
 CONFIG_DRM_AMD_SECURE_DISPLAY=y
+CONFIG_AMD_PRIVATE_COLOR=y
 # end of Display Engine Configuration
 
 CONFIG_HSA_AMD=y
@@ -10944,7 +10948,8 @@ CONFIG_ASYNC_XOR=m
 CONFIG_ASYNC_PQ=m
 CONFIG_ASYNC_RAID6_RECOV=m
 CONFIG_CRYPTO=y
-
+CONFIG_NTSYNC=m
+CONFIG_ACPI_CALL=m
 #
 # Crypto core or helper
 #
@@ -11808,6 +11813,7 @@ CONFIG_ASYNC_RAID6_TEST=m
 # CONFIG_TEST_OBJPOOL is not set
 CONFIG_ARCH_USE_MEMTEST=y
 CONFIG_MEMTEST=y
+
 # CONFIG_HYPERV_TESTING is not set
 # end of Kernel Testing and Coverage
 
diff --git a/patches/cachyos/0001-bore-cachy.patch b/patches/cachyos/0001-bore-cachy.patch
index 8aeeab5..cf2571b 100644
--- a/patches/cachyos/0001-bore-cachy.patch
+++ b/patches/cachyos/0001-bore-cachy.patch
@@ -1,24 +1,24 @@
-From 1ab81cfa061f454316364a32761ce45a7479e616 Mon Sep 17 00:00:00 2001
+From 37fd243d8f075b558f54a36fc85887269310709c Mon Sep 17 00:00:00 2001
 From: Piotr Gorski <lucjan.lucjanov@gmail.com>
-Date: Thu, 7 Mar 2024 22:28:47 +0100
+Date: Tue, 26 Mar 2024 08:11:18 +0100
 Subject: [PATCH] bore-cachy
 
 Signed-off-by: Piotr Gorski <lucjan.lucjanov@gmail.com>
 ---
- include/linux/sched.h   |  12 ++
- init/Kconfig            |  19 +++
- kernel/sched/core.c     | 148 +++++++++++++++++++
- kernel/sched/debug.c    |  61 +++++++-
- kernel/sched/fair.c     | 319 ++++++++++++++++++++++++++++++++++++----
+ include/linux/sched.h   |  10 ++
+ init/Kconfig            |  17 +++
+ kernel/sched/core.c     | 144 +++++++++++++++++++++++++
+ kernel/sched/debug.c    |  60 ++++++++++-
+ kernel/sched/fair.c     | 231 +++++++++++++++++++++++++++++++++++++---
  kernel/sched/features.h |   4 +
- kernel/sched/sched.h    |   7 +
- 7 files changed, 542 insertions(+), 28 deletions(-)
+ kernel/sched/sched.h    |   7 ++
+ 7 files changed, 457 insertions(+), 16 deletions(-)
 
 diff --git a/include/linux/sched.h b/include/linux/sched.h
-index ffe8f618a..7ac6163f9 100644
+index ffe8f618a..0ab0b0424 100644
 --- a/include/linux/sched.h
 +++ b/include/linux/sched.h
-@@ -547,6 +547,18 @@ struct sched_entity {
+@@ -547,6 +547,16 @@ struct sched_entity {
  	u64				sum_exec_runtime;
  	u64				prev_sum_exec_runtime;
  	u64				vruntime;
@@ -28,8 +28,6 @@ index ffe8f618a..7ac6163f9 100644
 +	u8				curr_burst_penalty;
 +	u8				burst_penalty;
 +	u8				burst_score;
-+	u32				burst_load;
-+	bool			on_cfs_rq;
 +	u8				child_burst;
 +	u32				child_burst_cnt;
 +	u64				child_burst_last_cached;
@@ -38,10 +36,10 @@ index ffe8f618a..7ac6163f9 100644
  	u64				slice;
  
 diff --git a/init/Kconfig b/init/Kconfig
-index 47671886d..c99132cf6 100644
+index 9ea39297f..f9bb5401f 100644
 --- a/init/Kconfig
 +++ b/init/Kconfig
-@@ -1299,6 +1299,25 @@ config CHECKPOINT_RESTORE
+@@ -1299,6 +1299,23 @@ config CHECKPOINT_RESTORE
  
  	  If unsure, say N here.
  
@@ -60,18 +58,16 @@ index 47671886d..c99132cf6 100644
 +	  With a little impact to scheduling fairness, it may improve
 +	  responsiveness especially under heavy background workload.
 +
-+	  You can turn it off by setting the sysctl kernel.sched_bore = 0.
-+
 +	  If unsure, say Y here.
 +
  config SCHED_AUTOGROUP
  	bool "Automatic process group scheduling"
  	select CGROUPS
 diff --git a/kernel/sched/core.c b/kernel/sched/core.c
-index 9116bcc90..43e4311db 100644
+index 9116bcc90..fc3d7b48e 100644
 --- a/kernel/sched/core.c
 +++ b/kernel/sched/core.c
-@@ -4507,6 +4507,143 @@ int wake_up_state(struct task_struct *p, unsigned int state)
+@@ -4507,6 +4507,139 @@ int wake_up_state(struct task_struct *p, unsigned int state)
  	return try_to_wake_up(p, state, 0);
  }
  
@@ -86,18 +82,14 @@ index 9116bcc90..43e4311db 100644
 +	init_task.se.curr_burst_penalty = 0;
 +	init_task.se.burst_penalty = 0;
 +	init_task.se.burst_score = 0;
-+	init_task.se.on_cfs_rq = false;
 +	init_task.se.child_burst_last_cached = 0;
-+	init_task.se.burst_load = 0;
 +}
 +
 +void inline sched_fork_bore(struct task_struct *p) {
 +	p->se.burst_time = 0;
 +	p->se.curr_burst_penalty = 0;
 +	p->se.burst_score = 0;
-+	p->se.on_cfs_rq = false;
 +	p->se.child_burst_last_cached = 0;
-+	p->se.burst_load = 0;
 +}
 +
 +static u32 count_child_tasks(struct task_struct *p) {
@@ -206,7 +198,7 @@ index 9116bcc90..43e4311db 100644
 +}
 +
 +static void sched_post_fork_bore(struct task_struct *p) {
-+	if (p->sched_class == &fair_sched_class && likely(sched_bore))
++	if (p->sched_class == &fair_sched_class)
 +		inherit_burst(p);
 +	p->se.burst_penalty = p->se.prev_burst_penalty;
 +}
@@ -215,7 +207,7 @@ index 9116bcc90..43e4311db 100644
  /*
   * Perform scheduler related setup for a newly forked process p.
   * p is forked by current.
-@@ -4523,6 +4660,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+@@ -4523,6 +4656,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
  	p->se.prev_sum_exec_runtime	= 0;
  	p->se.nr_migrations		= 0;
  	p->se.vruntime			= 0;
@@ -225,7 +217,7 @@ index 9116bcc90..43e4311db 100644
  	p->se.vlag			= 0;
  	p->se.slice			= sysctl_sched_base_slice;
  	INIT_LIST_HEAD(&p->se.group_node);
-@@ -4839,6 +4979,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+@@ -4839,6 +4975,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
  
  void sched_post_fork(struct task_struct *p)
  {
@@ -235,20 +227,20 @@ index 9116bcc90..43e4311db 100644
  	uclamp_post_fork(p);
  }
  
-@@ -9910,6 +10053,11 @@ void __init sched_init(void)
+@@ -9910,6 +10049,11 @@ void __init sched_init(void)
  	BUG_ON(&dl_sched_class != &stop_sched_class + 1);
  #endif
  
 +#ifdef CONFIG_SCHED_BORE
 +	sched_init_bore();
-+	printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 4.5.2 by Masahito Suzuki");
++	printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.0.3 by Masahito Suzuki");
 +#endif // CONFIG_SCHED_BORE
 +
  	wait_bit_init();
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
 diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
-index 8d5d98a58..a565363fd 100644
+index 8d5d98a58..b17861261 100644
 --- a/kernel/sched/debug.c
 +++ b/kernel/sched/debug.c
 @@ -167,7 +167,52 @@ static const struct file_operations sched_feat_fops = {
@@ -344,19 +336,18 @@ index 8d5d98a58..a565363fd 100644
  #ifdef CONFIG_NUMA_BALANCING
  	SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
  #endif
-@@ -1068,6 +1123,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
+@@ -1068,6 +1123,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
  
  	P(se.load.weight);
  #ifdef CONFIG_SMP
 +#ifdef CONFIG_SCHED_BORE
-+	P(se.burst_load);
 +	P(se.burst_score);
 +#endif // CONFIG_SCHED_BORE
  	P(se.avg.load_sum);
  	P(se.avg.runnable_sum);
  	P(se.avg.util_sum);
 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index fc0a9de42..3ee4e7e70 100644
+index fc0a9de42..ae55f46a8 100644
 --- a/kernel/sched/fair.c
 +++ b/kernel/sched/fair.c
 @@ -19,6 +19,9 @@
@@ -369,7 +360,7 @@ index fc0a9de42..3ee4e7e70 100644
   */
  #include <linux/energy_model.h>
  #include <linux/mmap_lock.h>
-@@ -64,28 +67,128 @@
+@@ -64,28 +67,125 @@
   *   SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
   *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
   *
@@ -412,14 +403,12 @@ index fc0a9de42..3ee4e7e70 100644
 +
 +#ifdef CONFIG_SCHED_BORE
 +u8   __read_mostly sched_bore                   = 1;
-+u8   __read_mostly sched_burst_score_rounding   = 0;
 +u8   __read_mostly sched_burst_smoothness_long  = 1;
 +u8   __read_mostly sched_burst_smoothness_short = 0;
 +u8   __read_mostly sched_burst_fork_atavistic   = 2;
 +u8   __read_mostly sched_burst_penalty_offset   = 22;
 +uint __read_mostly sched_burst_penalty_scale    = 1280;
 +uint __read_mostly sched_burst_cache_lifetime   = 60000000;
-+u8   __read_mostly sched_vlag_deviation_limit   = 11;
 +static int __maybe_unused thirty_two     = 32;
 +static int __maybe_unused sixty_four     = 64;
 +static int __maybe_unused maxval_12_bits = 4095;
@@ -456,20 +445,19 @@ index fc0a9de42..3ee4e7e70 100644
 +	return __unscale_slice(delta, se->burst_score);
 +}
 +
-+static void avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se);
-+static void avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se);
++void reweight_task(struct task_struct *p, int prio);
 +
 +static void update_burst_score(struct sched_entity *se) {
-+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-+	u8 prev_score = se->burst_score;
-+	u32 penalty = se->burst_penalty;
-+	if (sched_burst_score_rounding) penalty += 0x2U;
-+	se->burst_score = penalty >> 2;
++	if (!entity_is_task(se)) return;
++	struct task_struct *p = task_of(se);
++	u8 prio = p->static_prio - MAX_RT_PRIO;
++	u8 prev_prio = min(39, prio + se->burst_score);
 +
-+	if ((se->burst_score != prev_score) && se->on_cfs_rq) {
-+		avg_vruntime_sub(cfs_rq, se);
-+		avg_vruntime_add(cfs_rq, se);
-+	}
++	se->burst_score = se->burst_penalty >> 2;
++
++	u8 new_prio = min(39, prio + se->burst_score);
++	if (new_prio != prev_prio)
++		reweight_task(p, new_prio);
 +}
 +
 +static void update_burst_penalty(struct sched_entity *se) {
@@ -509,7 +497,7 @@ index fc0a9de42..3ee4e7e70 100644
  
  int sched_thermal_decay_shift;
  static int __init setup_sched_thermal_decay_shift(char *str)
-@@ -136,12 +239,8 @@ int __weak arch_asym_cpu_priority(int cpu)
+@@ -136,12 +236,8 @@ int __weak arch_asym_cpu_priority(int cpu)
   *
   * (default: 5 msec, units: microseconds)
   */
@@ -522,7 +510,7 @@ index fc0a9de42..3ee4e7e70 100644
  
  #ifdef CONFIG_NUMA_BALANCING
  /* Restrict the NUMA promotion throughput (MB/s) for each target node. */
-@@ -150,6 +249,87 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
+@@ -150,6 +246,69 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
  
  #ifdef CONFIG_SYSCTL
  static struct ctl_table sched_fair_sysctls[] = {
@@ -533,16 +521,7 @@ index fc0a9de42..3ee4e7e70 100644
 +		.maxlen		= sizeof(u8),
 +		.mode		= 0644,
 +		.proc_handler = proc_dou8vec_minmax,
-+		.extra1		= SYSCTL_ZERO,
-+		.extra2		= SYSCTL_ONE,
-+	},
-+	{
-+		.procname	= "sched_burst_score_rounding",
-+		.data		= &sched_burst_score_rounding,
-+		.maxlen		= sizeof(u8),
-+		.mode		= 0644,
-+		.proc_handler = proc_dou8vec_minmax,
-+		.extra1		= SYSCTL_ZERO,
++		.extra1		= SYSCTL_ONE,
 +		.extra2		= SYSCTL_ONE,
 +	},
 +	{
@@ -597,20 +576,11 @@ index fc0a9de42..3ee4e7e70 100644
 +		.mode		= 0644,
 +		.proc_handler = proc_douintvec,
 +	},
-+	{
-+		.procname	= "sched_vlag_deviation_limit",
-+		.data		= &sched_vlag_deviation_limit,
-+		.maxlen		= sizeof(u8),
-+		.mode		= 0644,
-+		.proc_handler = proc_dou8vec_minmax,
-+		.extra1		= SYSCTL_ZERO,
-+		.extra2		= &thirty_two,
-+	},
 +#endif // CONFIG_SCHED_BORE
  #ifdef CONFIG_CFS_BANDWIDTH
  	{
  		.procname       = "sched_cfs_bandwidth_slice_us",
-@@ -208,6 +388,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
+@@ -208,6 +367,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w)
   *
   * This idea comes from the SD scheduler of Con Kolivas:
   */
@@ -624,7 +594,7 @@ index fc0a9de42..3ee4e7e70 100644
  static unsigned int get_update_sysctl_factor(void)
  {
  	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
-@@ -238,6 +425,7 @@ static void update_sysctl(void)
+@@ -238,6 +404,7 @@ static void update_sysctl(void)
  	SET_SYSCTL(sched_base_slice);
  #undef SET_SYSCTL
  }
@@ -632,130 +602,17 @@ index fc0a9de42..3ee4e7e70 100644
  
  void __init sched_init_granularity(void)
  {
-@@ -311,6 +499,9 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
- 	if (unlikely(se->load.weight != NICE_0_LOAD))
- 		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
- 
-+#ifdef CONFIG_SCHED_BORE
-+	if (likely(sched_bore)) delta = scale_slice(delta, se);
-+#endif // CONFIG_SCHED_BORE
- 	return delta;
- }
- 
-@@ -637,10 +828,26 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
-  *
-  * As measured, the max (key * weight) value was ~44 bits for a kernel build.
-  */
-+#if !defined(CONFIG_SCHED_BORE)
-+#define entity_weight(se) scale_load_down(se->load.weight)
-+#else // CONFIG_SCHED_BORE
-+static unsigned long entity_weight(struct sched_entity *se) {
-+	unsigned long weight = se->load.weight;
-+	if (likely(sched_bore)) weight = unscale_slice(weight, se);
-+#ifdef CONFIG_64BIT
-+	weight >>= SCHED_FIXEDPOINT_SHIFT - 3;
-+#endif // CONFIG_64BIT
-+	return weight;
-+}
-+#endif // CONFIG_SCHED_BORE
-+
- static void
- avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
--	unsigned long weight = scale_load_down(se->load.weight);
-+	unsigned long weight = entity_weight(se);
-+#ifdef CONFIG_SCHED_BORE
-+	se->burst_load = weight;
-+#endif // CONFIG_SCHED_BORE
- 	s64 key = entity_key(cfs_rq, se);
- 
- 	cfs_rq->avg_vruntime += key * weight;
-@@ -650,7 +857,12 @@ avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
- static void
- avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
--	unsigned long weight = scale_load_down(se->load.weight);
-+#if !defined(CONFIG_SCHED_BORE)
-+	unsigned long weight = entity_weight(se);
-+#else // CONFIG_SCHED_BORE
-+	unsigned long weight = se->burst_load;
-+	se->burst_load = 0;
-+#endif // CONFIG_SCHED_BORE
- 	s64 key = entity_key(cfs_rq, se);
- 
- 	cfs_rq->avg_vruntime -= key * weight;
-@@ -670,14 +882,14 @@ void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta)
-  * Specifically: avg_runtime() + 0 must result in entity_eligible() := true
-  * For this to be so, the result of this function must have a left bias.
-  */
--u64 avg_vruntime(struct cfs_rq *cfs_rq)
-+static u64 avg_key(struct cfs_rq *cfs_rq)
- {
- 	struct sched_entity *curr = cfs_rq->curr;
- 	s64 avg = cfs_rq->avg_vruntime;
- 	long load = cfs_rq->avg_load;
- 
- 	if (curr && curr->on_rq) {
--		unsigned long weight = scale_load_down(curr->load.weight);
-+		unsigned long weight = entity_weight(curr);
- 
- 		avg += entity_key(cfs_rq, curr) * weight;
- 		load += weight;
-@@ -687,12 +899,15 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
- 		/* sign flips effective floor / ceil */
- 		if (avg < 0)
- 			avg -= (load - 1);
--		avg = div_s64(avg, load);
-+		avg = div64_s64(avg, load);
- 	}
- 
--	return cfs_rq->min_vruntime + avg;
-+	return avg;
- }
- 
-+u64 avg_vruntime(struct cfs_rq *cfs_rq) {
-+	return cfs_rq->min_vruntime + avg_key(cfs_rq);
-+}
- /*
-  * lag_i = S - s_i = w_i * (V - v_i)
-  *
-@@ -717,6 +932,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+@@ -717,6 +884,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
  	lag = avg_vruntime(cfs_rq) - se->vruntime;
  
  	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
 +#ifdef CONFIG_SCHED_BORE
-+	if (likely(sched_bore)) limit >>= 1;
++	limit >>= 1;
 +#endif // CONFIG_SCHED_BORE
  	se->vlag = clamp(lag, -limit, limit);
  }
  
-@@ -744,7 +962,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
- 	long load = cfs_rq->avg_load;
- 
- 	if (curr && curr->on_rq) {
--		unsigned long weight = scale_load_down(curr->load.weight);
-+		unsigned long weight = entity_weight(curr);
- 
- 		avg += entity_key(cfs_rq, curr) * weight;
- 		load += weight;
-@@ -840,10 +1058,16 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
- 	se->min_vruntime = se->vruntime;
- 	rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
- 				__entity_less, &min_vruntime_cb);
-+#ifdef CONFIG_SCHED_BORE
-+	se->on_cfs_rq = true;
-+#endif // CONFIG_SCHED_BORE
- }
- 
- static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
- {
-+#ifdef CONFIG_SCHED_BORE
-+	se->on_cfs_rq = false;
-+#endif // CONFIG_SCHED_BORE
- 	rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline,
- 				  &min_vruntime_cb);
- 	avg_vruntime_sub(cfs_rq, se);
-@@ -968,6 +1192,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+@@ -968,6 +1138,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
   * Scheduling class statistics methods:
   */
  #ifdef CONFIG_SMP
@@ -763,7 +620,7 @@ index fc0a9de42..3ee4e7e70 100644
  int sched_update_scaling(void)
  {
  	unsigned int factor = get_update_sysctl_factor();
-@@ -979,6 +1204,7 @@ int sched_update_scaling(void)
+@@ -979,6 +1150,7 @@ int sched_update_scaling(void)
  
  	return 0;
  }
@@ -771,7 +628,7 @@ index fc0a9de42..3ee4e7e70 100644
  #endif
  #endif
  
-@@ -1178,7 +1404,13 @@ static void update_curr(struct cfs_rq *cfs_rq)
+@@ -1178,7 +1350,13 @@ static void update_curr(struct cfs_rq *cfs_rq)
  	if (unlikely(delta_exec <= 0))
  		return;
  
@@ -785,54 +642,17 @@ index fc0a9de42..3ee4e7e70 100644
  	update_deadline(cfs_rq, curr);
  	update_min_vruntime(cfs_rq);
  
-@@ -5170,8 +5402,8 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
- static void
- place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- {
--	u64 vslice, vruntime = avg_vruntime(cfs_rq);
--	s64 lag = 0;
-+	s64 lag = 0, key = avg_key(cfs_rq);
-+	u64 vslice, vruntime = cfs_rq->min_vruntime + key;
- 
- 	se->slice = sysctl_sched_base_slice;
- 	vslice = calc_delta_fair(se->slice, se);
-@@ -5184,6 +5416,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+@@ -5184,6 +5362,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
  	 *
  	 * EEVDF: placement strategy #1 / #2
  	 */
 +#ifdef CONFIG_SCHED_BORE
-+	if (unlikely(!sched_bore) || se->vlag)
++	if (se->vlag)
 +#endif // CONFIG_SCHED_BORE
  	if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
  		struct sched_entity *curr = cfs_rq->curr;
  		unsigned long load;
-@@ -5244,12 +5479,22 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
- 		 */
- 		load = cfs_rq->avg_load;
- 		if (curr && curr->on_rq)
--			load += scale_load_down(curr->load.weight);
-+			load += entity_weight(curr);
- 
--		lag *= load + scale_load_down(se->load.weight);
-+		lag *= load + entity_weight(se);
-+#if !defined(CONFIG_SCHED_BORE)
- 		if (WARN_ON_ONCE(!load))
-+#else // CONFIG_SCHED_BORE
-+		if (unlikely(!load))
-+#endif // CONFIG_SCHED_BORE
- 			load = 1;
--		lag = div_s64(lag, load);
-+		lag = div64_s64(lag, load);
-+#ifdef CONFIG_SCHED_BORE
-+		if (likely(sched_bore)) {
-+			s64 limit = vslice << sched_vlag_deviation_limit;
-+			lag = clamp(lag, -limit, limit);
-+		}
-+#endif // CONFIG_SCHED_BORE
- 	}
- 
- 	se->vruntime = vruntime - lag;
-@@ -6816,6 +7061,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+@@ -6816,6 +6997,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  	bool was_sched_idle = sched_idle_rq(rq);
  
  	util_est_dequeue(&rq->cfs, p);
@@ -847,7 +667,7 @@ index fc0a9de42..3ee4e7e70 100644
  
  	for_each_sched_entity(se) {
  		cfs_rq = cfs_rq_of(se);
-@@ -8565,16 +8818,25 @@ static void yield_task_fair(struct rq *rq)
+@@ -8565,16 +8754,25 @@ static void yield_task_fair(struct rq *rq)
  	/*
  	 * Are we the only task in the tree?
  	 */
@@ -873,7 +693,7 @@ index fc0a9de42..3ee4e7e70 100644
  	/*
  	 * Tell update_rq_clock() that we've just updated,
  	 * so we don't do microscopic update in schedule()
-@@ -12664,6 +12926,9 @@ static void task_fork_fair(struct task_struct *p)
+@@ -12664,6 +12862,9 @@ static void task_fork_fair(struct task_struct *p)
  	curr = cfs_rq->curr;
  	if (curr)
  		update_curr(cfs_rq);
diff --git a/patches/cachyos/0001-cachyos-base-all.patch b/patches/cachyos/0001-cachyos-base-all.patch
index 148a8a7..84d0f67 100644
--- a/patches/cachyos/0001-cachyos-base-all.patch
+++ b/patches/cachyos/0001-cachyos-base-all.patch
@@ -1,7 +1,1204 @@
-From 8f03bb4df21c5746b9f1c3e399faa3c932737e4f Mon Sep 17 00:00:00 2001
+From 2b7dc22b0a950292985c4d5118c5eeaa51ea2918 Mon Sep 17 00:00:00 2001
 From: Peter Jung <admin@ptr1337.dev>
-Date: Fri, 15 Mar 2024 20:08:47 +0100
-Subject: [PATCH 1/7] amd-pstate
+Date: Wed, 3 Apr 2024 17:06:09 +0200
+Subject: [PATCH 1/8] aes-xts
+
+Signed-off-by: Peter Jung <admin@ptr1337.dev>
+---
+ arch/x86/Kconfig.assembler           |  10 +
+ arch/x86/crypto/Makefile             |   3 +-
+ arch/x86/crypto/aes-xts-avx-x86_64.S | 838 +++++++++++++++++++++++++++
+ arch/x86/crypto/aesni-intel_glue.c   | 270 ++++++++-
+ 4 files changed, 1118 insertions(+), 3 deletions(-)
+ create mode 100644 arch/x86/crypto/aes-xts-avx-x86_64.S
+
+diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
+index 8ad41da301e5..59aedf32c4ea 100644
+--- a/arch/x86/Kconfig.assembler
++++ b/arch/x86/Kconfig.assembler
+@@ -25,6 +25,16 @@ config AS_GFNI
+ 	help
+ 	  Supported by binutils >= 2.30 and LLVM integrated assembler
+ 
++config AS_VAES
++	def_bool $(as-instr,vaesenc %ymm0$(comma)%ymm1$(comma)%ymm2)
++	help
++	  Supported by binutils >= 2.30 and LLVM integrated assembler
++
++config AS_VPCLMULQDQ
++	def_bool $(as-instr,vpclmulqdq \$0x10$(comma)%ymm0$(comma)%ymm1$(comma)%ymm2)
++	help
++	  Supported by binutils >= 2.30 and LLVM integrated assembler
++
+ config AS_WRUSS
+ 	def_bool $(as-instr,wrussq %rax$(comma)(%rbx))
+ 	help
+diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
+index 9aa46093c91b..9c5ce5613738 100644
+--- a/arch/x86/crypto/Makefile
++++ b/arch/x86/crypto/Makefile
+@@ -48,7 +48,8 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
+ 
+ obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
+ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
+-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
++aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \
++	aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o
+ 
+ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
+ sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
+diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S
+new file mode 100644
+index 000000000000..b8005d0205f8
+--- /dev/null
++++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
+@@ -0,0 +1,838 @@
++/* SPDX-License-Identifier: GPL-2.0-or-later */
++/*
++ * AES-XTS for modern x86_64 CPUs
++ *
++ * Copyright 2024 Google LLC
++ *
++ * Author: Eric Biggers <ebiggers@google.com>
++ */
++
++/*
++ * This file implements AES-XTS for modern x86_64 CPUs.  To handle the
++ * complexities of coding for x86 SIMD, e.g. where every vector length needs
++ * different code, it uses a macro to generate several implementations that
++ * share similar source code but are targeted at different CPUs, listed below:
++ *
++ * AES-NI + AVX
++ *    - 128-bit vectors (1 AES block per vector)
++ *    - VEX-coded instructions
++ *    - xmm0-xmm15
++ *    - This is for older CPUs that lack VAES but do have AVX.
++ *
++ * VAES + VPCLMULQDQ + AVX2
++ *    - 256-bit vectors (2 AES blocks per vector)
++ *    - VEX-coded instructions
++ *    - ymm0-ymm15
++ *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
++ *      e.g. Intel's Alder Lake and AMD's Zen 3.
++ *
++ * VAES + VPCLMULQDQ + AVX10/256 + BMI2
++ *    - 256-bit vectors (2 AES blocks per vector)
++ *    - EVEX-coded instructions
++ *    - ymm0-ymm31
++ *    - This is for CPUs that have AVX512 but where using zmm registers causes
++ *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
++ *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
++ *      To avoid confusion with 512-bit, we just write AVX10/256.
++ *
++ * VAES + VPCLMULQDQ + AVX10/512 + BMI2
++ *    - Same as the previous one, but upgrades to 512-bit vectors
++ *      (4 AES blocks per vector) in zmm0-zmm31.
++ *    - This is for CPUs that have good AVX512 or AVX10/512 support.
++ *
++ * This file doesn't have an implementation for AES-NI alone (without AVX), as
++ * the lack of VEX would make all the assembly code different.
++ *
++ * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
++ * the XTS tweaks.  This avoids a bottleneck.  Currently there don't seem to be
++ * any CPUs that support VAES but not VPCLMULQDQ.  If that changes, we might
++ * need to start also providing an implementation using VAES alone.
++ *
++ * The AES-XTS implementations in this file support everything required by the
++ * crypto API, including support for arbitrary input lengths and multi-part
++ * processing.  However, they are most heavily optimized for the common case of
++ * power-of-2 length inputs that are processed in a single part (disk sectors).
++ */
++
++#include <linux/linkage.h>
++#include <linux/cfi_types.h>
++
++.section .rodata
++.p2align 4
++.Lgf_poly:
++	// The low 64 bits of this value represent the polynomial x^7 + x^2 + x
++	// + 1.  It is the value that must be XOR'd into the low 64 bits of the
++	// tweak each time a 1 is carried out of the high 64 bits.
++	//
++	// The high 64 bits of this value is just the internal carry bit that
++	// exists when there's a carry out of the low 64 bits of the tweak.
++	.quad	0x87, 1
++
++	// This table contains constants for vpshufb and vpblendvb, used to
++	// handle variable byte shifts and blending during ciphertext stealing
++	// on CPUs that don't support AVX10-style masking.
++.Lcts_permute_table:
++	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
++	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
++	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
++	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
++	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
++	.byte	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
++.text
++
++// Function parameters
++.set	KEY,		%rdi	// Initially points to crypto_aes_ctx, then is
++				// advanced to point directly to the round keys
++.set	SRC,		%rsi	// Pointer to next source data
++.set	DST,		%rdx	// Pointer to next destination data
++.set	LEN,		%rcx	// Remaining length in bytes
++.set	TWEAK,		%r8	// Pointer to next tweak
++
++// %r9d holds the AES key length in bytes.
++.set	KEYLEN,		%r9d
++
++// %rax and %r10-r11 are available as temporaries.
++
++.macro	_define_Vi	i
++.if VL == 16
++	.set	V\i,		%xmm\i
++.elseif VL == 32
++	.set	V\i,		%ymm\i
++.elseif VL == 64
++	.set	V\i,		%zmm\i
++.else
++	.error "Unsupported Vector Length (VL)"
++.endif
++.endm
++
++.macro _define_aliases
++	// Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
++	// are available, that map to the xmm, ymm, or zmm registers according
++	// to the selected Vector Length (VL).
++	_define_Vi	0
++	_define_Vi	1
++	_define_Vi	2
++	_define_Vi	3
++	_define_Vi	4
++	_define_Vi	5
++	_define_Vi	6
++	_define_Vi	7
++	_define_Vi	8
++	_define_Vi	9
++	_define_Vi	10
++	_define_Vi	11
++	_define_Vi	12
++	_define_Vi	13
++	_define_Vi	14
++	_define_Vi	15
++.if USE_AVX10
++	_define_Vi	16
++	_define_Vi	17
++	_define_Vi	18
++	_define_Vi	19
++	_define_Vi	20
++	_define_Vi	21
++	_define_Vi	22
++	_define_Vi	23
++	_define_Vi	24
++	_define_Vi	25
++	_define_Vi	26
++	_define_Vi	27
++	_define_Vi	28
++	_define_Vi	29
++	_define_Vi	30
++	_define_Vi	31
++.endif
++
++	// V0-V3 hold the data blocks during the main loop, or temporary values
++	// otherwise.  V4-V5 hold temporary values.
++
++	// V6-V9 hold XTS tweaks.  Each 128-bit lane holds one tweak.
++	.set	TWEAK0_XMM,	%xmm6
++	.set	TWEAK0,		V6
++	.set	TWEAK1_XMM,	%xmm7
++	.set	TWEAK1,		V7
++	.set	TWEAK2,		V8
++	.set	TWEAK3,		V9
++
++	// V10-V13 are used for computing the next values of TWEAK[0-3].
++	.set	NEXT_TWEAK0,	V10
++	.set	NEXT_TWEAK1,	V11
++	.set	NEXT_TWEAK2,	V12
++	.set	NEXT_TWEAK3,	V13
++
++	// V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
++	.set	GF_POLY_XMM,	%xmm14
++	.set	GF_POLY,	V14
++
++	// V15 holds the first AES round key, copied to all 128-bit lanes.
++	.set	KEY0_XMM,	%xmm15
++	.set	KEY0,		V15
++
++	// If 32 SIMD registers are available, then V16-V29 hold the remaining
++	// AES round keys, copied to all 128-bit lanes.
++.if USE_AVX10
++	.set	KEY1_XMM,	%xmm16
++	.set	KEY1,		V16
++	.set	KEY2_XMM,	%xmm17
++	.set	KEY2,		V17
++	.set	KEY3_XMM,	%xmm18
++	.set	KEY3,		V18
++	.set	KEY4_XMM,	%xmm19
++	.set	KEY4,		V19
++	.set	KEY5_XMM,	%xmm20
++	.set	KEY5,		V20
++	.set	KEY6_XMM,	%xmm21
++	.set	KEY6,		V21
++	.set	KEY7_XMM,	%xmm22
++	.set	KEY7,		V22
++	.set	KEY8_XMM,	%xmm23
++	.set	KEY8,		V23
++	.set	KEY9_XMM,	%xmm24
++	.set	KEY9,		V24
++	.set	KEY10_XMM,	%xmm25
++	.set	KEY10,		V25
++	.set	KEY11_XMM,	%xmm26
++	.set	KEY11,		V26
++	.set	KEY12_XMM,	%xmm27
++	.set	KEY12,		V27
++	.set	KEY13_XMM,	%xmm28
++	.set	KEY13,		V28
++	.set	KEY14_XMM,	%xmm29
++	.set	KEY14,		V29
++.endif
++	// V30-V31 are currently unused.
++.endm
++
++// Move a vector between memory and a register.
++.macro	_vmovdqu	src, dst
++.if VL < 64
++	vmovdqu		\src, \dst
++.else
++	vmovdqu8	\src, \dst
++.endif
++.endm
++
++// Broadcast a 128-bit value into a vector.
++.macro	_vbroadcast128	src, dst
++.if VL == 16 && !USE_AVX10
++	vmovdqu		\src, \dst
++.elseif VL == 32 && !USE_AVX10
++	vbroadcasti128	\src, \dst
++.else
++	vbroadcasti32x4	\src, \dst
++.endif
++.endm
++
++// XOR two vectors together.
++.macro	_vpxor	src1, src2, dst
++.if USE_AVX10
++	vpxord		\src1, \src2, \dst
++.else
++	vpxor		\src1, \src2, \dst
++.endif
++.endm
++
++// XOR three vectors together.
++.macro	_xor3	src1, src2, src3_and_dst
++.if USE_AVX10
++	// vpternlogd with immediate 0x96 is a three-argument XOR.
++	vpternlogd	$0x96, \src1, \src2, \src3_and_dst
++.else
++	vpxor		\src1, \src3_and_dst, \src3_and_dst
++	vpxor		\src2, \src3_and_dst, \src3_and_dst
++.endif
++.endm
++
++// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
++// (by multiplying by the polynomial 'x') and write it to \dst.
++.macro	_next_tweak	src, tmp, dst
++	vpshufd		$0x13, \src, \tmp
++	vpaddq		\src, \src, \dst
++	vpsrad		$31, \tmp, \tmp
++	vpand		GF_POLY_XMM, \tmp, \tmp
++	vpxor		\tmp, \dst, \dst
++.endm
++
++// Given the XTS tweak(s) in the vector \src, compute the next vector of
++// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
++//
++// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
++// all tweaks in the vector in parallel.  If VL=16, we just do the regular
++// computation without vpclmulqdq, as it's the faster method for a single tweak.
++.macro	_next_tweakvec	src, tmp1, tmp2, dst
++.if VL == 16
++	_next_tweak	\src, \tmp1, \dst
++.else
++	vpsrlq		$64 - VL/16, \src, \tmp1
++	vpclmulqdq	$0x01, GF_POLY, \tmp1, \tmp2
++	vpslldq		$8, \tmp1, \tmp1
++	vpsllq		$VL/16, \src, \dst
++	_xor3		\tmp1, \tmp2, \dst
++.endif
++.endm
++
++// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
++// store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
++.macro	_compute_first_set_of_tweaks
++	vmovdqu		(TWEAK), TWEAK0_XMM
++	_vbroadcast128	.Lgf_poly(%rip), GF_POLY
++.if VL == 16
++	// With VL=16, multiplying by x serially is fastest.
++	_next_tweak	TWEAK0, %xmm0, TWEAK1
++	_next_tweak	TWEAK1, %xmm0, TWEAK2
++	_next_tweak	TWEAK2, %xmm0, TWEAK3
++.else
++.if VL == 32
++	// Compute the second block of TWEAK0.
++	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
++	vinserti128	$1, %xmm1, TWEAK0, TWEAK0
++.elseif VL == 64
++	// Compute the remaining blocks of TWEAK0.
++	_next_tweak	TWEAK0_XMM, %xmm0, %xmm1
++	_next_tweak	%xmm1, %xmm0, %xmm2
++	_next_tweak	%xmm2, %xmm0, %xmm3
++	vinserti32x4	$1, %xmm1, TWEAK0, TWEAK0
++	vinserti32x4	$2, %xmm2, TWEAK0, TWEAK0
++	vinserti32x4	$3, %xmm3, TWEAK0, TWEAK0
++.endif
++	// Compute TWEAK[1-3] from TWEAK0.
++	vpsrlq		$64 - 1*VL/16, TWEAK0, V0
++	vpsrlq		$64 - 2*VL/16, TWEAK0, V2
++	vpsrlq		$64 - 3*VL/16, TWEAK0, V4
++	vpclmulqdq	$0x01, GF_POLY, V0, V1
++	vpclmulqdq	$0x01, GF_POLY, V2, V3
++	vpclmulqdq	$0x01, GF_POLY, V4, V5
++	vpslldq		$8, V0, V0
++	vpslldq		$8, V2, V2
++	vpslldq		$8, V4, V4
++	vpsllq		$1*VL/16, TWEAK0, TWEAK1
++	vpsllq		$2*VL/16, TWEAK0, TWEAK2
++	vpsllq		$3*VL/16, TWEAK0, TWEAK3
++.if USE_AVX10
++	vpternlogd	$0x96, V0, V1, TWEAK1
++	vpternlogd	$0x96, V2, V3, TWEAK2
++	vpternlogd	$0x96, V4, V5, TWEAK3
++.else
++	vpxor		V0, TWEAK1, TWEAK1
++	vpxor		V2, TWEAK2, TWEAK2
++	vpxor		V4, TWEAK3, TWEAK3
++	vpxor		V1, TWEAK1, TWEAK1
++	vpxor		V3, TWEAK2, TWEAK2
++	vpxor		V5, TWEAK3, TWEAK3
++.endif
++.endif
++.endm
++
++// Do one step in computing the next set of tweaks using the method of just
++// multiplying by x repeatedly (the same method _next_tweak uses).
++.macro	_tweak_step_mulx	i
++.if \i == 0
++	.set PREV_TWEAK, TWEAK3
++	.set NEXT_TWEAK, NEXT_TWEAK0
++.elseif \i == 5
++	.set PREV_TWEAK, NEXT_TWEAK0
++	.set NEXT_TWEAK, NEXT_TWEAK1
++.elseif \i == 10
++	.set PREV_TWEAK, NEXT_TWEAK1
++	.set NEXT_TWEAK, NEXT_TWEAK2
++.elseif \i == 15
++	.set PREV_TWEAK, NEXT_TWEAK2
++	.set NEXT_TWEAK, NEXT_TWEAK3
++.endif
++.if \i < 20 && \i % 5 == 0
++	vpshufd		$0x13, PREV_TWEAK, V5
++.elseif \i < 20 && \i % 5 == 1
++	vpaddq		PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
++.elseif \i < 20 && \i % 5 == 2
++	vpsrad		$31, V5, V5
++.elseif \i < 20 && \i % 5 == 3
++	vpand		GF_POLY, V5, V5
++.elseif \i < 20 && \i % 5 == 4
++	vpxor		V5, NEXT_TWEAK, NEXT_TWEAK
++.elseif \i == 1000
++	vmovdqa		NEXT_TWEAK0, TWEAK0
++	vmovdqa		NEXT_TWEAK1, TWEAK1
++	vmovdqa		NEXT_TWEAK2, TWEAK2
++	vmovdqa		NEXT_TWEAK3, TWEAK3
++.endif
++.endm
++
++// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
++// (the same method _next_tweakvec uses for VL > 16).  This means multiplying
++// each tweak by x^(4*VL/16) independently.  Since 4*VL/16 is a multiple of 8
++// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
++// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
++.macro	_tweak_step_pclmul	i
++.if \i == 2
++	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
++.elseif \i == 4
++	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
++.elseif \i == 6
++	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
++.elseif \i == 8
++	vpsrldq		$(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
++.elseif \i == 10
++	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
++.elseif \i == 12
++	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
++.elseif \i == 14
++	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
++.elseif \i == 16
++	vpclmulqdq	$0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
++.elseif \i == 1000
++	vpslldq		$(4*VL/16) / 8, TWEAK0, TWEAK0
++	vpslldq		$(4*VL/16) / 8, TWEAK1, TWEAK1
++	vpslldq		$(4*VL/16) / 8, TWEAK2, TWEAK2
++	vpslldq		$(4*VL/16) / 8, TWEAK3, TWEAK3
++	_vpxor		NEXT_TWEAK0, TWEAK0, TWEAK0
++	_vpxor		NEXT_TWEAK1, TWEAK1, TWEAK1
++	_vpxor		NEXT_TWEAK2, TWEAK2, TWEAK2
++	_vpxor		NEXT_TWEAK3, TWEAK3, TWEAK3
++.endif
++.endm
++
++// _tweak_step does one step of the computation of the next set of tweaks from
++// TWEAK[0-3].  To complete all steps, this must be invoked with \i values 0
++// through at least 19, then 1000 which signals the last step.
++//
++// This is used to interleave the computation of the next set of tweaks with the
++// AES en/decryptions, which increases performance in some cases.
++.macro	_tweak_step	i
++.if VL == 16
++	_tweak_step_mulx	\i
++.else
++	_tweak_step_pclmul	\i
++.endif
++.endm
++
++// Load the round keys: just the first one if !USE_AVX10, otherwise all of them.
++.macro	_load_round_keys
++	_vbroadcast128	0*16(KEY), KEY0
++.if USE_AVX10
++	_vbroadcast128	1*16(KEY), KEY1
++	_vbroadcast128	2*16(KEY), KEY2
++	_vbroadcast128	3*16(KEY), KEY3
++	_vbroadcast128	4*16(KEY), KEY4
++	_vbroadcast128	5*16(KEY), KEY5
++	_vbroadcast128	6*16(KEY), KEY6
++	_vbroadcast128	7*16(KEY), KEY7
++	_vbroadcast128	8*16(KEY), KEY8
++	_vbroadcast128	9*16(KEY), KEY9
++	_vbroadcast128	10*16(KEY), KEY10
++	// Note: if it's AES-128 or AES-192, the last several round keys won't
++	// be used.  We do the loads anyway to save a conditional jump.
++	_vbroadcast128	11*16(KEY), KEY11
++	_vbroadcast128	12*16(KEY), KEY12
++	_vbroadcast128	13*16(KEY), KEY13
++	_vbroadcast128	14*16(KEY), KEY14
++.endif
++.endm
++
++// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
++// on the block(s) in \data using the round key(s) in \key.  The register length
++// determines the number of AES blocks en/decrypted.
++.macro	_vaes	enc, last, key, data
++.if \enc
++.if \last
++	vaesenclast	\key, \data, \data
++.else
++	vaesenc		\key, \data, \data
++.endif
++.else
++.if \last
++	vaesdeclast	\key, \data, \data
++.else
++	vaesdec		\key, \data, \data
++.endif
++.endif
++.endm
++
++// Do a single round of AES en/decryption on the block(s) in \data, using the
++// same key for all block(s).  The round key is loaded from the appropriate
++// register or memory location for round \i.  May clobber V4.
++.macro _vaes_1x		enc, last, i, xmm_suffix, data
++.if USE_AVX10
++	_vaes		\enc, \last, KEY\i\xmm_suffix, \data
++.else
++.ifnb \xmm_suffix
++	_vaes		\enc, \last, \i*16(KEY), \data
++.else
++	_vbroadcast128	\i*16(KEY), V4
++	_vaes		\enc, \last, V4, \data
++.endif
++.endif
++.endm
++
++// Do a single round of AES en/decryption on the blocks in registers V0-V3,
++// using the same key for all blocks.  The round key is loaded from the
++// appropriate register or memory location for round \i.  In addition, does
++// tweak-computation steps 2*(\i-1) and 2*(\i-1)+1.  May clobber V4.
++.macro	_vaes_4x	enc, last, i
++.if USE_AVX10
++	_tweak_step	(2*(\i-1))
++	_vaes		\enc, \last, KEY\i, V0
++	_vaes		\enc, \last, KEY\i, V1
++	_tweak_step	(2*(\i-1) + 1)
++	_vaes		\enc, \last, KEY\i, V2
++	_vaes		\enc, \last, KEY\i, V3
++.else
++	_vbroadcast128	\i*16(KEY), V4
++	_tweak_step	(2*(\i-1))
++	_vaes		\enc, \last, V4, V0
++	_vaes		\enc, \last, V4, V1
++	_tweak_step	(2*(\i-1) + 1)
++	_vaes		\enc, \last, V4, V2
++	_vaes		\enc, \last, V4, V3
++.endif
++.endm
++
++// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
++// then XOR with \tweak again) of the block(s) in \data.  To process a single
++// block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
++// length VL, use V* registers and leave \xmm_suffix empty.  May clobber V4.
++.macro	_aes_crypt	enc, xmm_suffix, tweak, data
++	_xor3		KEY0\xmm_suffix, \tweak, \data
++	_vaes_1x	\enc, 0, 1, \xmm_suffix, \data
++	_vaes_1x	\enc, 0, 2, \xmm_suffix, \data
++	_vaes_1x	\enc, 0, 3, \xmm_suffix, \data
++	_vaes_1x	\enc, 0, 4, \xmm_suffix, \data
++	_vaes_1x	\enc, 0, 5, \xmm_suffix, \data
++	_vaes_1x	\enc, 0, 6, \xmm_suffix, \data
++	_vaes_1x	\enc, 0, 7, \xmm_suffix, \data
++	_vaes_1x	\enc, 0, 8, \xmm_suffix, \data
++	_vaes_1x	\enc, 0, 9, \xmm_suffix, \data
++	cmp		$24, KEYLEN
++	jle		.Laes_128_or_192\@
++	_vaes_1x	\enc, 0, 10, \xmm_suffix, \data
++	_vaes_1x	\enc, 0, 11, \xmm_suffix, \data
++	_vaes_1x	\enc, 0, 12, \xmm_suffix, \data
++	_vaes_1x	\enc, 0, 13, \xmm_suffix, \data
++	_vaes_1x	\enc, 1, 14, \xmm_suffix, \data
++	jmp		.Laes_done\@
++.Laes_128_or_192\@:
++	je		.Laes_192\@
++	_vaes_1x	\enc, 1, 10, \xmm_suffix, \data
++	jmp		.Laes_done\@
++.Laes_192\@:
++	_vaes_1x	\enc, 0, 10, \xmm_suffix, \data
++	_vaes_1x	\enc, 0, 11, \xmm_suffix, \data
++	_vaes_1x	\enc, 1, 12, \xmm_suffix, \data
++.Laes_done\@:
++	_vpxor		\tweak, \data, \data
++.endm
++
++.macro	_aes_xts_crypt	enc
++	_define_aliases
++
++	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
++	movl		480(KEY), KEYLEN
++
++	// If decrypting, advance KEY to the decryption round keys.
++.if !\enc
++	add		$240, KEY
++.endif
++
++	// Check whether the data length is a multiple of the AES block length.
++	test		$15, LEN
++	jnz		.Lneed_cts\@
++.Lxts_init\@:
++
++	// Cache as many round keys as possible.
++	_load_round_keys
++
++	// Compute the first set of tweaks TWEAK[0-3].
++	_compute_first_set_of_tweaks
++
++	sub		$4*VL, LEN
++	jl		.Lhandle_remainder\@
++
++.Lmain_loop\@:
++	// This is the main loop, en/decrypting 4*VL bytes per iteration.
++
++	// XOR each source block with its tweak and the first round key.
++.if USE_AVX10
++	vmovdqu8	0*VL(SRC), V0
++	vmovdqu8	1*VL(SRC), V1
++	vmovdqu8	2*VL(SRC), V2
++	vmovdqu8	3*VL(SRC), V3
++	vpternlogd	$0x96, TWEAK0, KEY0, V0
++	vpternlogd	$0x96, TWEAK1, KEY0, V1
++	vpternlogd	$0x96, TWEAK2, KEY0, V2
++	vpternlogd	$0x96, TWEAK3, KEY0, V3
++.else
++	vpxor		0*VL(SRC), KEY0, V0
++	vpxor		1*VL(SRC), KEY0, V1
++	vpxor		2*VL(SRC), KEY0, V2
++	vpxor		3*VL(SRC), KEY0, V3
++	vpxor		TWEAK0, V0, V0
++	vpxor		TWEAK1, V1, V1
++	vpxor		TWEAK2, V2, V2
++	vpxor		TWEAK3, V3, V3
++.endif
++	// Do all the AES rounds on the data blocks, interleaved with
++	// the computation of the next set of tweaks.
++	_vaes_4x	\enc, 0, 1
++	_vaes_4x	\enc, 0, 2
++	_vaes_4x	\enc, 0, 3
++	_vaes_4x	\enc, 0, 4
++	_vaes_4x	\enc, 0, 5
++	_vaes_4x	\enc, 0, 6
++	_vaes_4x	\enc, 0, 7
++	_vaes_4x	\enc, 0, 8
++	_vaes_4x	\enc, 0, 9
++	// Try to optimize for AES-256 by keeping the code for AES-128 and
++	// AES-192 out-of-line.
++	cmp		$24, KEYLEN
++	jle		.Lencrypt_4x_aes_128_or_192\@
++	_vaes_4x	\enc, 0, 10
++	_vaes_4x	\enc, 0, 11
++	_vaes_4x	\enc, 0, 12
++	_vaes_4x	\enc, 0, 13
++	_vaes_4x	\enc, 1, 14
++.Lencrypt_4x_done\@:
++
++	// XOR in the tweaks again.
++	_vpxor		TWEAK0, V0, V0
++	_vpxor		TWEAK1, V1, V1
++	_vpxor		TWEAK2, V2, V2
++	_vpxor		TWEAK3, V3, V3
++
++	// Store the destination blocks.
++	_vmovdqu	V0, 0*VL(DST)
++	_vmovdqu	V1, 1*VL(DST)
++	_vmovdqu	V2, 2*VL(DST)
++	_vmovdqu	V3, 3*VL(DST)
++
++	// Finish computing the next set of tweaks.
++	_tweak_step	1000
++
++	add		$4*VL, SRC
++	add		$4*VL, DST
++	sub		$4*VL, LEN
++	jge		.Lmain_loop\@
++
++	// Check for the uncommon case where the data length isn't a multiple of
++	// 4*VL.  Handle it out-of-line in order to optimize for the common
++	// case.  In the common case, just fall through to the ret.
++	test		$4*VL-1, LEN
++	jnz		.Lhandle_remainder\@
++.Ldone\@:
++	// Store the next tweak back to *TWEAK to support continuation calls.
++	vmovdqu		TWEAK0_XMM, (TWEAK)
++.if VL > 16
++	vzeroupper
++.endif
++	RET
++
++.Lhandle_remainder\@:
++	add		$4*VL, LEN	// Undo the extra sub from earlier.
++
++	// En/decrypt any remaining full blocks, one vector at a time.
++.if VL > 16
++	sub		$VL, LEN
++	jl		.Lvec_at_a_time_done\@
++.Lvec_at_a_time\@:
++	_vmovdqu	(SRC), V0
++	_aes_crypt	\enc, , TWEAK0, V0
++	_vmovdqu	V0, (DST)
++	_next_tweakvec	TWEAK0, V0, V1, TWEAK0
++	add		$VL, SRC
++	add		$VL, DST
++	sub		$VL, LEN
++	jge		.Lvec_at_a_time\@
++.Lvec_at_a_time_done\@:
++	add		$VL-16, LEN	// Undo the extra sub from earlier.
++.else
++	sub		$16, LEN
++.endif
++
++	// En/decrypt any remaining full blocks, one at a time.
++	jl		.Lblock_at_a_time_done\@
++.Lblock_at_a_time\@:
++	vmovdqu		(SRC), %xmm0
++	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
++	vmovdqu		%xmm0, (DST)
++	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK0_XMM
++	add		$16, SRC
++	add		$16, DST
++	sub		$16, LEN
++	jge		.Lblock_at_a_time\@
++.Lblock_at_a_time_done\@:
++	add		$16, LEN	// Undo the extra sub from earlier.
++
++.Lfull_blocks_done\@:
++	// Now 0 <= LEN <= 15.  If LEN is nonzero, do ciphertext stealing to
++	// process the last 16 + LEN bytes.  If LEN is zero, we're done.
++	test		LEN, LEN
++	jnz		.Lcts\@
++	jmp		.Ldone\@
++
++	// Out-of-line handling of AES-128 and AES-192
++.Lencrypt_4x_aes_128_or_192\@:
++	jz		.Lencrypt_4x_aes_192\@
++	_vaes_4x	\enc, 1, 10
++	jmp		.Lencrypt_4x_done\@
++.Lencrypt_4x_aes_192\@:
++	_vaes_4x	\enc, 0, 10
++	_vaes_4x	\enc, 0, 11
++	_vaes_4x	\enc, 1, 12
++	jmp		.Lencrypt_4x_done\@
++
++.Lneed_cts\@:
++	// The data length isn't a multiple of the AES block length, so
++	// ciphertext stealing (CTS) will be needed.  Subtract one block from
++	// LEN so that the main loop doesn't process the last full block.  The
++	// CTS step will process it specially along with the partial block.
++	sub		$16, LEN
++	jmp		.Lxts_init\@
++
++.Lcts\@:
++	// Do ciphertext stealing (CTS) to en/decrypt the last full block and
++	// the partial block.  CTS needs two tweaks.  TWEAK0_XMM contains the
++	// next tweak; compute the one after that.  Decryption uses these two
++	// tweaks in reverse order, so also define aliases to handle that.
++	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
++.if \enc
++	.set		CTS_TWEAK0,	TWEAK0_XMM
++	.set		CTS_TWEAK1,	TWEAK1_XMM
++.else
++	.set		CTS_TWEAK0,	TWEAK1_XMM
++	.set		CTS_TWEAK1,	TWEAK0_XMM
++.endif
++
++	// En/decrypt the last full block.
++	vmovdqu		(SRC), %xmm0
++	_aes_crypt	\enc, _XMM, CTS_TWEAK0, %xmm0
++
++.if USE_AVX10
++	// Create a mask that has the first LEN bits set.
++	mov		$-1, %rax
++	bzhi		LEN, %rax, %rax
++	kmovq		%rax, %k1
++
++	// Swap the first LEN bytes of the above result with the partial block.
++	// Note that to support in-place en/decryption, the load from the src
++	// partial block must happen before the store to the dst partial block.
++	vmovdqa		%xmm0, %xmm1
++	vmovdqu8	16(SRC), %xmm0{%k1}
++	vmovdqu8	%xmm1, 16(DST){%k1}
++.else
++	lea		.Lcts_permute_table(%rip), %rax
++
++	// Load the src partial block, left-aligned.  Note that to support
++	// in-place en/decryption, this must happen before the store to the dst
++	// partial block.
++	vmovdqu		(SRC, LEN, 1), %xmm1
++
++	// Shift the first LEN bytes of the en/decryption of the last full block
++	// to the end of a register, then store it to DST+LEN.  This stores the
++	// dst partial block.  It also writes to the second part of the dst last
++	// full block, but that part is overwritten later.
++	vpshufb		(%rax, LEN, 1), %xmm0, %xmm2
++	vmovdqu		%xmm2, (DST, LEN, 1)
++
++	// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
++	sub		LEN, %rax
++	vmovdqu		32(%rax), %xmm3
++
++	// Shift the src partial block to the beginning of its register.
++	vpshufb		%xmm3, %xmm1, %xmm1
++
++	// Do a blend to generate the src partial block followed by the second
++	// part of the en/decryption of the last full block.
++	vpblendvb	%xmm3, %xmm0, %xmm1, %xmm0
++.endif
++	// En/decrypt again and store the last full block.
++	_aes_crypt	\enc, _XMM, CTS_TWEAK1, %xmm0
++	vmovdqu		%xmm0, (DST)
++	jmp		.Ldone\@
++.endm
++
++// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
++//			   u8 iv[AES_BLOCK_SIZE]);
++SYM_FUNC_START(aes_xts_encrypt_iv)
++	vmovdqu		(%rsi), %xmm0
++	vpxor		0*16(%rdi), %xmm0, %xmm0
++	vaesenc		1*16(%rdi), %xmm0, %xmm0
++	vaesenc		2*16(%rdi), %xmm0, %xmm0
++	vaesenc		3*16(%rdi), %xmm0, %xmm0
++	vaesenc		4*16(%rdi), %xmm0, %xmm0
++	vaesenc		5*16(%rdi), %xmm0, %xmm0
++	vaesenc		6*16(%rdi), %xmm0, %xmm0
++	vaesenc		7*16(%rdi), %xmm0, %xmm0
++	vaesenc		8*16(%rdi), %xmm0, %xmm0
++	vaesenc		9*16(%rdi), %xmm0, %xmm0
++	cmpl		$24, 480(%rdi)
++	jle		.Lencrypt_iv_aes_128_or_192
++	vaesenc		10*16(%rdi), %xmm0, %xmm0
++	vaesenc		11*16(%rdi), %xmm0, %xmm0
++	vaesenc		12*16(%rdi), %xmm0, %xmm0
++	vaesenc		13*16(%rdi), %xmm0, %xmm0
++	vaesenclast	14*16(%rdi), %xmm0, %xmm0
++.Lencrypt_iv_done:
++	vmovdqu		%xmm0, (%rsi)
++	RET
++
++	// Out-of-line handling of AES-128 and AES-192
++.Lencrypt_iv_aes_128_or_192:
++	jz		.Lencrypt_iv_aes_192
++	vaesenclast	10*16(%rdi), %xmm0, %xmm0
++	jmp		.Lencrypt_iv_done
++.Lencrypt_iv_aes_192:
++	vaesenc		10*16(%rdi), %xmm0, %xmm0
++	vaesenc		11*16(%rdi), %xmm0, %xmm0
++	vaesenclast	12*16(%rdi), %xmm0, %xmm0
++	jmp		.Lencrypt_iv_done
++SYM_FUNC_END(aes_xts_encrypt_iv)
++
++// Below are the actual AES-XTS encryption and decryption functions,
++// instantiated from the above macro.  They all have the following prototype:
++//
++// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
++//			const u8 *src, u8 *dst, size_t len,
++//			u8 tweak[AES_BLOCK_SIZE]);
++//
++// |key| is the data key.  |tweak| contains the next tweak; the encryption of
++// the original IV with the tweak key was already done.  This function supports
++// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
++// |len| must be a multiple of 16 except on the last call.  If |len| is a
++// multiple of 16, then this function updates |tweak| to contain the next tweak.
++
++.set	VL, 16
++.set	USE_AVX10, 0
++SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
++	_aes_xts_crypt	1
++SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
++SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
++	_aes_xts_crypt	0
++SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
++
++#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
++.set	VL, 32
++.set	USE_AVX10, 0
++SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
++	_aes_xts_crypt	1
++SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
++SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
++	_aes_xts_crypt	0
++SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
++
++.set	VL, 32
++.set	USE_AVX10, 1
++SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
++	_aes_xts_crypt	1
++SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
++SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
++	_aes_xts_crypt	0
++SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)
++
++.set	VL, 64
++.set	USE_AVX10, 1
++SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
++	_aes_xts_crypt	1
++SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
++SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
++	_aes_xts_crypt	0
++SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
++#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
+diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
+index b1d90c25975a..0855ace8659c 100644
+--- a/arch/x86/crypto/aesni-intel_glue.c
++++ b/arch/x86/crypto/aesni-intel_glue.c
+@@ -1137,7 +1137,264 @@ static struct skcipher_alg aesni_xctr = {
+ };
+ 
+ static struct simd_skcipher_alg *aesni_simd_xctr;
+-#endif /* CONFIG_X86_64 */
++
++asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
++				   u8 iv[AES_BLOCK_SIZE]);
++
++typedef void (*xts_asm_func)(const struct crypto_aes_ctx *key,
++			     const u8 *src, u8 *dst, size_t len,
++			     u8 tweak[AES_BLOCK_SIZE]);
++
++/* This handles cases where the source and/or destination span pages. */
++static noinline int
++xts_crypt_slowpath(struct skcipher_request *req, xts_asm_func asm_func)
++{
++	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
++	const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
++	int tail = req->cryptlen % AES_BLOCK_SIZE;
++	struct scatterlist sg_src[2], sg_dst[2];
++	struct skcipher_request subreq;
++	struct skcipher_walk walk;
++	struct scatterlist *src, *dst;
++	int err;
++
++	/*
++	 * If the message length isn't divisible by the AES block size, then
++	 * separate off the last full block and the partial block.  This ensures
++	 * that they are processed in the same call to the assembly function,
++	 * which is required for ciphertext stealing.
++	 */
++	if (tail) {
++		skcipher_request_set_tfm(&subreq, tfm);
++		skcipher_request_set_callback(&subreq,
++					      skcipher_request_flags(req),
++					      NULL, NULL);
++		skcipher_request_set_crypt(&subreq, req->src, req->dst,
++					   req->cryptlen - tail - AES_BLOCK_SIZE,
++					   req->iv);
++		req = &subreq;
++	}
++
++	err = skcipher_walk_virt(&walk, req, false);
++
++	while (walk.nbytes) {
++		unsigned int nbytes = walk.nbytes;
++
++		if (nbytes < walk.total)
++			nbytes = round_down(nbytes, AES_BLOCK_SIZE);
++
++		kernel_fpu_begin();
++		(*asm_func)(&ctx->crypt_ctx, walk.src.virt.addr,
++			    walk.dst.virt.addr, nbytes, req->iv);
++		kernel_fpu_end();
++		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
++	}
++
++	if (err || !tail)
++		return err;
++
++	/* Do ciphertext stealing with the last full block and partial block. */
++
++	dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
++	if (req->dst != req->src)
++		dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
++
++	skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
++				   req->iv);
++
++	err = skcipher_walk_virt(&walk, req, false);
++	if (err)
++		return err;
++
++	kernel_fpu_begin();
++	(*asm_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr,
++		    walk.nbytes, req->iv);
++	kernel_fpu_end();
++
++	return skcipher_walk_done(&walk, 0);
++}
++
++/* __always_inline to avoid indirect call in fastpath */
++static __always_inline int
++xts_crypt2(struct skcipher_request *req, xts_asm_func asm_func)
++{
++	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
++	const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
++	const unsigned int cryptlen = req->cryptlen;
++	struct scatterlist *src = req->src;
++	struct scatterlist *dst = req->dst;
++
++	if (unlikely(cryptlen < AES_BLOCK_SIZE))
++		return -EINVAL;
++
++	kernel_fpu_begin();
++	aes_xts_encrypt_iv(&ctx->tweak_ctx, req->iv);
++
++	/*
++	 * In practice, virtually all XTS plaintexts and ciphertexts are either
++	 * 512 or 4096 bytes, aligned such that they don't span page boundaries.
++	 * To optimize the performance of these cases, and also any other case
++	 * where no page boundary is spanned, the below fast-path handles
++	 * single-page sources and destinations as efficiently as possible.
++	 */
++	if (likely(src->length >= cryptlen && dst->length >= cryptlen &&
++		   src->offset + cryptlen <= PAGE_SIZE &&
++		   dst->offset + cryptlen <= PAGE_SIZE)) {
++		struct page *src_page = sg_page(src);
++		struct page *dst_page = sg_page(dst);
++		void *src_virt = kmap_local_page(src_page) + src->offset;
++		void *dst_virt = kmap_local_page(dst_page) + dst->offset;
++
++		(*asm_func)(&ctx->crypt_ctx, src_virt, dst_virt, cryptlen,
++			    req->iv);
++		kunmap_local(dst_virt);
++		kunmap_local(src_virt);
++		kernel_fpu_end();
++		return 0;
++	}
++	kernel_fpu_end();
++	return xts_crypt_slowpath(req, asm_func);
++}
++
++#define DEFINE_XTS_ALG(suffix, driver_name, priority)			       \
++									       \
++asmlinkage void aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key,     \
++					 const u8 *src, u8 *dst, size_t len,   \
++					 u8 tweak[AES_BLOCK_SIZE]);	       \
++asmlinkage void aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key,     \
++					 const u8 *src, u8 *dst, size_t len,   \
++					 u8 tweak[AES_BLOCK_SIZE]);	       \
++									       \
++static int xts_encrypt_##suffix(struct skcipher_request *req)		       \
++{									       \
++	return xts_crypt2(req, aes_xts_encrypt_##suffix);		       \
++}									       \
++									       \
++static int xts_decrypt_##suffix(struct skcipher_request *req)		       \
++{									       \
++	return xts_crypt2(req, aes_xts_decrypt_##suffix);		       \
++}									       \
++									       \
++static struct skcipher_alg aes_xts_alg_##suffix = {			       \
++	.base = {							       \
++		.cra_name		= "__xts(aes)",			       \
++		.cra_driver_name	= "__" driver_name,		       \
++		.cra_priority		= priority,			       \
++		.cra_flags		= CRYPTO_ALG_INTERNAL,		       \
++		.cra_blocksize		= AES_BLOCK_SIZE,		       \
++		.cra_ctxsize		= XTS_AES_CTX_SIZE,		       \
++		.cra_module		= THIS_MODULE,			       \
++	},								       \
++	.min_keysize	= 2 * AES_MIN_KEY_SIZE,				       \
++	.max_keysize	= 2 * AES_MAX_KEY_SIZE,				       \
++	.ivsize		= AES_BLOCK_SIZE,				       \
++	.walksize	= 2 * AES_BLOCK_SIZE,				       \
++	.setkey		= xts_aesni_setkey,				       \
++	.encrypt	= xts_encrypt_##suffix,				       \
++	.decrypt	= xts_decrypt_##suffix,				       \
++};									       \
++									       \
++static struct simd_skcipher_alg *aes_xts_simdalg_##suffix
++
++DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500);
++#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
++DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600);
++DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700);
++DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800);
++#endif
++
++/*
++ * This is a list of CPU models that are known to suffer from downclocking when
++ * zmm registers (512-bit vectors) are used.  On these CPUs, the AES-XTS
++ * implementation with zmm registers won't be used by default.  An
++ * implementation with ymm registers (256-bit vectors) will be used instead.
++ */
++static const struct x86_cpu_id zmm_exclusion_list[] = {
++	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_SKYLAKE_X },
++	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_X },
++	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_D },
++	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE },
++	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_L },
++	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_NNPI },
++	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_TIGERLAKE_L },
++	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_TIGERLAKE },
++	/* Allow Rocket Lake and later, and Sapphire Rapids and later. */
++	/* Also allow AMD CPUs (starting with Zen 4, the first with AVX-512). */
++	{},
++};
++
++static int __init register_xts_algs(void)
++{
++	int err;
++
++	if (!boot_cpu_has(X86_FEATURE_AVX))
++		return 0;
++	err = simd_register_skciphers_compat(&aes_xts_alg_aesni_avx, 1,
++					     &aes_xts_simdalg_aesni_avx);
++	if (err)
++		return err;
++#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
++	if (!boot_cpu_has(X86_FEATURE_AVX2) ||
++	    !boot_cpu_has(X86_FEATURE_VAES) ||
++	    !boot_cpu_has(X86_FEATURE_VPCLMULQDQ) ||
++	    !boot_cpu_has(X86_FEATURE_PCLMULQDQ) ||
++	    !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
++		return 0;
++	err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx2, 1,
++					     &aes_xts_simdalg_vaes_avx2);
++	if (err)
++		return err;
++
++	if (!boot_cpu_has(X86_FEATURE_AVX512BW) ||
++	    !boot_cpu_has(X86_FEATURE_AVX512VL) ||
++	    !boot_cpu_has(X86_FEATURE_BMI2) ||
++	    !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
++			       XFEATURE_MASK_AVX512, NULL))
++		return 0;
++
++	err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_256, 1,
++					     &aes_xts_simdalg_vaes_avx10_256);
++	if (err)
++		return err;
++
++	if (x86_match_cpu(zmm_exclusion_list))
++		aes_xts_alg_vaes_avx10_512.base.cra_priority = 1;
++
++	err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1,
++					     &aes_xts_simdalg_vaes_avx10_512);
++	if (err)
++		return err;
++#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
++	return 0;
++}
++
++static void unregister_xts_algs(void)
++{
++	if (aes_xts_simdalg_aesni_avx)
++		simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1,
++					  &aes_xts_simdalg_aesni_avx);
++#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
++	if (aes_xts_simdalg_vaes_avx2)
++		simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1,
++					  &aes_xts_simdalg_vaes_avx2);
++	if (aes_xts_simdalg_vaes_avx10_256)
++		simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1,
++					  &aes_xts_simdalg_vaes_avx10_256);
++	if (aes_xts_simdalg_vaes_avx10_512)
++		simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1,
++					  &aes_xts_simdalg_vaes_avx10_512);
++#endif
++}
++#else /* CONFIG_X86_64 */
++static int __init register_xts_algs(void)
++{
++	return 0;
++}
++
++static void unregister_xts_algs(void)
++{
++}
++#endif /* !CONFIG_X86_64 */
+ 
+ #ifdef CONFIG_X86_64
+ static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
+@@ -1276,13 +1533,21 @@ static int __init aesni_init(void)
+ 		goto unregister_aeads;
+ #endif /* CONFIG_X86_64 */
+ 
++	err = register_xts_algs();
++	if (err)
++		goto unregister_xts;
++
+ 	return 0;
+ 
++unregister_xts:
++	unregister_xts_algs();
+ #ifdef CONFIG_X86_64
++	if (aesni_simd_xctr)
++		simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
+ unregister_aeads:
++#endif /* CONFIG_X86_64 */
+ 	simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
+ 				aesni_simd_aeads);
+-#endif /* CONFIG_X86_64 */
+ 
+ unregister_skciphers:
+ 	simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
+@@ -1303,6 +1568,7 @@ static void __exit aesni_exit(void)
+ 	if (boot_cpu_has(X86_FEATURE_AVX))
+ 		simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
+ #endif /* CONFIG_X86_64 */
++	unregister_xts_algs();
+ }
+ 
+ late_initcall(aesni_init);
+-- 
+2.44.0
+
+From 4a47b09deb67c3854ac102bcb18ef0df00aae437 Mon Sep 17 00:00:00 2001
+From: Peter Jung <admin@ptr1337.dev>
+Date: Wed, 3 Apr 2024 17:06:20 +0200
+Subject: [PATCH 2/8] amd-pstate
 
 Signed-off-by: Peter Jung <admin@ptr1337.dev>
 ---
@@ -9,19 +1206,18 @@ Signed-off-by: Peter Jung <admin@ptr1337.dev>
  Documentation/admin-guide/pm/amd-pstate.rst   |  70 ++-
  arch/x86/Kconfig                              |   5 +-
  arch/x86/include/asm/msr-index.h              |   2 +
- arch/x86/kernel/acpi/cppc.c                   |   2 +-
  drivers/acpi/cppc_acpi.c                      |  17 +-
  drivers/acpi/processor_driver.c               |   6 +
  drivers/cpufreq/acpi-cpufreq.c                |   2 -
  drivers/cpufreq/amd-pstate-ut.c               |   2 +-
- drivers/cpufreq/amd-pstate.c                  | 501 +++++++++++++++---
+ drivers/cpufreq/amd-pstate.c                  | 499 +++++++++++++++---
  include/acpi/cppc_acpi.h                      |   5 +
  include/linux/amd-pstate.h                    |  32 +-
  include/linux/cpufreq.h                       |   1 +
- 13 files changed, 562 insertions(+), 88 deletions(-)
+ 12 files changed, 560 insertions(+), 86 deletions(-)
 
 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
-index 73062d47a462..a493d93e0d2c 100644
+index d2150bd3acc5..71ed7f1b0f9b 100644
 --- a/Documentation/admin-guide/kernel-parameters.txt
 +++ b/Documentation/admin-guide/kernel-parameters.txt
 @@ -374,6 +374,11 @@
@@ -132,7 +1328,7 @@ index 9eb26014d34b..82fbd01da658 100644
  ===============================================
  
 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
-index 637e337c332e..de39c296ea3f 100644
+index 184730705650..70732a76171f 100644
 --- a/arch/x86/Kconfig
 +++ b/arch/x86/Kconfig
 @@ -1054,8 +1054,9 @@ config SCHED_MC
@@ -160,19 +1356,6 @@ index d1b5edaf6c34..bfe139eb75b6 100644
  
  /* K6 MSRs */
  #define MSR_K6_WHCR			0xc0000082
-diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
-index 8d8752b44f11..ff8f25faca3d 100644
---- a/arch/x86/kernel/acpi/cppc.c
-+++ b/arch/x86/kernel/acpi/cppc.c
-@@ -20,7 +20,7 @@ bool cpc_supported_by_cpu(void)
- 		    (boot_cpu_data.x86_model >= 0x20 && boot_cpu_data.x86_model <= 0x2f)))
- 			return true;
- 		else if (boot_cpu_data.x86 == 0x17 &&
--			 boot_cpu_data.x86_model >= 0x70 && boot_cpu_data.x86_model <= 0x7f)
-+			 boot_cpu_data.x86_model >= 0x30 && boot_cpu_data.x86_model <= 0x7f)
- 			return true;
- 		return boot_cpu_has(X86_FEATURE_CPPC);
- 	}
 diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c
 index d155a86a8614..e23a84f4a50a 100644
 --- a/drivers/acpi/cppc_acpi.c
@@ -260,7 +1443,7 @@ index f04ae67dda37..b3601b0e6dd3 100644
  					(policy->max == cpudata->nominal_freq))
  				amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS;
 diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
-index 1791d37fbc53..651055df1710 100644
+index 07f341995439..651055df1710 100644
 --- a/drivers/cpufreq/amd-pstate.c
 +++ b/drivers/cpufreq/amd-pstate.c
 @@ -37,6 +37,7 @@
@@ -455,7 +1638,7 @@ index 1791d37fbc53..651055df1710 100644
  
  	if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
  		amd_pstate_update_min_max_limit(policy);
-@@ -564,13 +635,12 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
+@@ -564,7 +635,6 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
  
  	cap_perf = READ_ONCE(cpudata->highest_perf);
  	lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf);
@@ -463,13 +1646,6 @@ index 1791d37fbc53..651055df1710 100644
  
  	des_perf = cap_perf;
  	if (target_perf < capacity)
- 		des_perf = DIV_ROUND_UP(cap_perf * target_perf, capacity);
- 
--	min_perf = READ_ONCE(cpudata->highest_perf);
-+	min_perf = READ_ONCE(cpudata->lowest_perf);
- 	if (_min_perf < capacity)
- 		min_perf = DIV_ROUND_UP(cap_perf * _min_perf, capacity);
- 
 @@ -582,8 +652,6 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
  		max_perf = min_perf;
  
@@ -629,7 +1805,8 @@ index 1791d37fbc53..651055df1710 100644
 -	nominal_perf = READ_ONCE(cpudata->nominal_perf);
 +	if (boot_cpu_has(X86_FEATURE_CPPC)) {
 +		u64 cap1;
-+
+ 
+-	if (highest_perf <= nominal_perf)
 +		ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1);
 +		if (ret)
 +			return ret;
@@ -652,12 +1829,13 @@ index 1791d37fbc53..651055df1710 100644
 +{
 +	int ret, prio;
 +	u32 highest_perf;
- 
--	if (highest_perf <= nominal_perf)
++
 +	ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf);
 +	if (ret)
-+		return;
-+
+ 		return;
+ 
+-	cpudata->boost_supported = true;
+-	current_pstate_driver->boost_enabled = true;
 +	cpudata->hw_prefcore = true;
 +	/* check if CPPC preferred core feature is enabled*/
 +	if (highest_perf < CPPC_MAX_PERF)
@@ -669,10 +1847,8 @@ index 1791d37fbc53..651055df1710 100644
 +	}
 +
 +	if (!amd_pstate_prefcore)
- 		return;
- 
--	cpudata->boost_supported = true;
--	current_pstate_driver->boost_enabled = true;
++		return;
++
 +	/*
 +	 * The priorities can be set regardless of whether or not
 +	 * sched_set_itmt_support(true) has been called and it is valid to
@@ -1193,7 +2369,7 @@ index 6ad02ad9c7b4..e89cf1249715 100644
 +
  #endif /* _LINUX_AMD_PSTATE_H */
 diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
-index afda5f24d3dd..9bebeec24abb 100644
+index 320fab7d2e94..3129411fa978 100644
 --- a/include/linux/cpufreq.h
 +++ b/include/linux/cpufreq.h
 @@ -263,6 +263,7 @@ static inline bool cpufreq_supports_freq_invariance(void)
@@ -1207,10 +2383,10 @@ index afda5f24d3dd..9bebeec24abb 100644
 -- 
 2.44.0
 
-From 93aefd5f98b793e9447e64dcbaa69221102e304a Mon Sep 17 00:00:00 2001
+From 7f2e4860d7405f71337e99ea74b84ebcd2c3b90c Mon Sep 17 00:00:00 2001
 From: Peter Jung <admin@ptr1337.dev>
-Date: Mon, 26 Feb 2024 15:46:58 +0100
-Subject: [PATCH 2/7] bbr3
+Date: Wed, 3 Apr 2024 17:06:31 +0200
+Subject: [PATCH 3/8] bbr3
 
 Signed-off-by: Peter Jung <admin@ptr1337.dev>
 ---
@@ -1564,7 +2740,7 @@ index ae8b15e6896f..beb040e80b6f 100644
  	.undo_cwnd = bpf_tcp_ca_undo_cwnd,
  	.sndbuf_expand = bpf_tcp_ca_sndbuf_expand,
 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
-index c82dc42f57c6..1bc25bc01a8d 100644
+index a4f418592314..58469fe5195e 100644
 --- a/net/ipv4/tcp.c
 +++ b/net/ipv4/tcp.c
 @@ -3089,6 +3089,7 @@ int tcp_disconnect(struct sock *sk, int flags)
@@ -4375,10 +5551,10 @@ index df7b13f0e5e0..8415aa41524e 100644
  	    tcp_in_quickack_mode(sk) ||
  	    /* Protocol state mandates a one-time immediate ACK */
 diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
-index 9e85f2a0bddd..914a75bb0734 100644
+index 0ecc7311dc6c..82622782486a 100644
 --- a/net/ipv4/tcp_minisocks.c
 +++ b/net/ipv4/tcp_minisocks.c
-@@ -464,6 +464,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
+@@ -460,6 +460,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
  	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
  	bool ca_got_dst = false;
  
@@ -4593,10 +5769,10 @@ index d1ad20ce1c8c..ef74f33c7905 100644
 -- 
 2.44.0
 
-From fb681aa9768aa30b3b17152a221868238394dd64 Mon Sep 17 00:00:00 2001
+From 71b4361aff469d7e31d2260c0f689a976a1a89d0 Mon Sep 17 00:00:00 2001
 From: Peter Jung <admin@ptr1337.dev>
-Date: Mon, 26 Feb 2024 15:47:11 +0100
-Subject: [PATCH 3/7] block
+Date: Wed, 3 Apr 2024 17:06:41 +0200
+Subject: [PATCH 4/8] block
 
 Signed-off-by: Peter Jung <admin@ptr1337.dev>
 ---
@@ -4864,7 +6040,7 @@ index 467e8cfc41a2..f44f5d4ec2f4 100644
  	 * bic associated with the task issuing current bio for
  	 * merging. This and the next field are used as a support to
 diff --git a/block/mq-deadline.c b/block/mq-deadline.c
-index f958e79277b8..1b0de4fc3958 100644
+index 02a916ba62ee..8bf621316a9e 100644
 --- a/block/mq-deadline.c
 +++ b/block/mq-deadline.c
 @@ -79,10 +79,24 @@ struct dd_per_prio {
@@ -4986,7 +6162,7 @@ index f958e79277b8..1b0de4fc3958 100644
  	return rq;
  }
  
-@@ -706,6 +764,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
+@@ -705,6 +763,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
  
  	eq->elevator_data = dd;
  
@@ -5000,7 +6176,7 @@ index f958e79277b8..1b0de4fc3958 100644
  	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
  		struct dd_per_prio *per_prio = &dd->per_prio[prio];
  
-@@ -722,8 +787,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
+@@ -721,8 +786,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
  	dd->last_dir = DD_WRITE;
  	dd->fifo_batch = fifo_batch;
  	dd->prio_aging_expire = prio_aging_expire;
@@ -5009,7 +6185,7 @@ index f958e79277b8..1b0de4fc3958 100644
  
  	/* We dispatch from request queue wide instead of hw queue */
  	blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
-@@ -779,7 +842,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
+@@ -778,7 +841,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
  	struct request *free = NULL;
  	bool ret;
  
@@ -5030,7 +6206,7 @@ index f958e79277b8..1b0de4fc3958 100644
  	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
  	spin_unlock(&dd->lock);
  
-@@ -792,10 +867,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
+@@ -791,10 +866,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
  /*
   * add rq to rbtree and fifo
   */
@@ -5042,7 +6218,7 @@ index f958e79277b8..1b0de4fc3958 100644
  	struct deadline_data *dd = q->elevator->elevator_data;
  	const enum dd_data_dir data_dir = rq_data_dir(rq);
  	u16 ioprio = req_get_ioprio(rq);
-@@ -867,19 +941,13 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
+@@ -866,19 +940,13 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
  {
  	struct request_queue *q = hctx->queue;
  	struct deadline_data *dd = q->elevator->elevator_data;
@@ -5068,7 +6244,7 @@ index f958e79277b8..1b0de4fc3958 100644
  }
  
  /* Callback from inside blk_mq_rq_ctx_init(). */
-@@ -958,6 +1026,10 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
+@@ -957,6 +1025,10 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
  	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
  	enum dd_prio prio;
  
@@ -5082,14 +6258,15 @@ index f958e79277b8..1b0de4fc3958 100644
 -- 
 2.44.0
 
-From 4f371ea8a1f8a47e624592a91f9e961080aec2eb Mon Sep 17 00:00:00 2001
+From b667355ece89a997a7b8508e6d6f1b5be46d3833 Mon Sep 17 00:00:00 2001
 From: Peter Jung <admin@ptr1337.dev>
-Date: Mon, 26 Feb 2024 15:47:21 +0100
-Subject: [PATCH 4/7] cachy
+Date: Wed, 3 Apr 2024 17:06:52 +0200
+Subject: [PATCH 5/8] cachy
 
 Signed-off-by: Peter Jung <admin@ptr1337.dev>
 ---
  .../admin-guide/kernel-parameters.txt         |   9 +
+ Documentation/admin-guide/sysctl/vm.rst       |  72 ++
  Makefile                                      | 162 ++++-
  arch/arm/Makefile                             |  56 +-
  arch/x86/Kconfig.cpu                          | 426 +++++++++++-
@@ -5102,6 +6279,15 @@ Signed-off-by: Peter Jung <admin@ptr1337.dev>
  block/elevator.c                              |  10 +
  drivers/ata/ahci.c                            |  23 +-
  drivers/cpufreq/Kconfig.x86                   |   2 -
+ drivers/gpu/drm/amd/amdgpu/amdgpu.h           |   1 +
+ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       |  10 +
+ drivers/gpu/drm/amd/display/Kconfig           |   6 +
+ .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |   2 +-
+ .../amd/display/amdgpu_dm/amdgpu_dm_color.c   |   2 +-
+ .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c    |   6 +-
+ .../amd/display/amdgpu_dm/amdgpu_dm_plane.c   |   6 +-
+ drivers/gpu/drm/amd/pm/amdgpu_pm.c            |   3 +
+ drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     |  14 +-
  drivers/i2c/busses/Kconfig                    |   9 +
  drivers/i2c/busses/Makefile                   |   1 +
  drivers/i2c/busses/i2c-nct6775.c              | 648 ++++++++++++++++++
@@ -5114,7 +6300,7 @@ Signed-off-by: Peter Jung <admin@ptr1337.dev>
  drivers/platform/x86/Kconfig                  |  14 +
  drivers/platform/x86/Makefile                 |   3 +
  drivers/platform/x86/steamdeck.c              | 523 ++++++++++++++
- include/linux/mm.h                            |   2 +-
+ include/linux/mm.h                            |  10 +-
  include/linux/pagemap.h                       |   2 +-
  include/linux/user_namespace.h                |   4 +
  init/Kconfig                                  |  26 +
@@ -5122,26 +6308,27 @@ Signed-off-by: Peter Jung <admin@ptr1337.dev>
  kernel/fork.c                                 |  14 +
  kernel/sched/fair.c                           |  13 +
  kernel/sched/sched.h                          |   2 +-
- kernel/sysctl.c                               |  12 +
+ kernel/sysctl.c                               |  46 ++
  kernel/user_namespace.c                       |   7 +
- mm/Kconfig                                    |   2 +-
+ mm/Kconfig                                    |  65 +-
  mm/compaction.c                               |   4 +
  mm/huge_memory.c                              |   4 +
+ mm/mm_init.c                                  |   1 +
  mm/page-writeback.c                           |   8 +
  mm/page_alloc.c                               |  27 +-
  mm/swap.c                                     |   5 +
  mm/vmpressure.c                               |   4 +
- mm/vmscan.c                                   |   8 +
- 43 files changed, 2639 insertions(+), 165 deletions(-)
+ mm/vmscan.c                                   | 178 ++++-
+ 54 files changed, 3020 insertions(+), 182 deletions(-)
  create mode 100644 drivers/i2c/busses/i2c-nct6775.c
  create mode 100644 drivers/pci/controller/intel-nvme-remap.c
  create mode 100644 drivers/platform/x86/steamdeck.c
 
 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
-index a493d93e0d2c..8d6a2ce37f8f 100644
+index 71ed7f1b0f9b..fbfaea49cbed 100644
 --- a/Documentation/admin-guide/kernel-parameters.txt
 +++ b/Documentation/admin-guide/kernel-parameters.txt
-@@ -4396,6 +4396,15 @@
+@@ -4394,6 +4394,15 @@
  		nomsi		[MSI] If the PCI_MSI kernel config parameter is
  				enabled, this kernel boot option can be used to
  				disable the use of MSI interrupts system-wide.
@@ -5157,8 +6344,105 @@ index a493d93e0d2c..8d6a2ce37f8f 100644
  		noioapicquirk	[APIC] Disable all boot interrupt quirks.
  				Safety option to keep boot IRQs enabled. This
  				should never be necessary.
+diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
+index c59889de122b..468ae7dec1e1 100644
+--- a/Documentation/admin-guide/sysctl/vm.rst
++++ b/Documentation/admin-guide/sysctl/vm.rst
+@@ -25,6 +25,9 @@ files can be found in mm/swap.c.
+ Currently, these files are in /proc/sys/vm:
+ 
+ - admin_reserve_kbytes
++- anon_min_ratio
++- clean_low_ratio
++- clean_min_ratio
+ - compact_memory
+ - compaction_proactiveness
+ - compact_unevictable_allowed
+@@ -106,6 +109,67 @@ On x86_64 this is about 128MB.
+ Changing this takes effect whenever an application requests memory.
+ 
+ 
++anon_min_ratio
++==============
++
++This knob provides *hard* protection of anonymous pages. The anonymous pages
++on the current node won't be reclaimed under any conditions when their amount
++is below vm.anon_min_ratio.
++
++This knob may be used to prevent excessive swap thrashing when anonymous
++memory is low (for example, when memory is going to be overfilled by
++compressed data of zram module).
++
++Setting this value too high (close to 100) can result in inability to
++swap and can lead to early OOM under memory pressure.
++
++The unit of measurement is the percentage of the total memory of the node.
++
++The default value is 15.
++
++
++clean_low_ratio
++================
++
++This knob provides *best-effort* protection of clean file pages. The file pages
++on the current node won't be reclaimed under memory pressure when the amount of
++clean file pages is below vm.clean_low_ratio *unless* we threaten to OOM.
++
++Protection of clean file pages using this knob may be used when swapping is
++still possible to
++  - prevent disk I/O thrashing under memory pressure;
++  - improve performance in disk cache-bound tasks under memory pressure.
++
++Setting it to a high value may result in an early eviction of anonymous pages
++into the swap space by attempting to hold the protected amount of clean file
++pages in memory.
++
++The unit of measurement is the percentage of the total memory of the node.
++
++The default value is 0.
++
++
++clean_min_ratio
++================
++
++This knob provides *hard* protection of clean file pages. The file pages on the
++current node won't be reclaimed under memory pressure when the amount of clean
++file pages is below vm.clean_min_ratio.
++
++Hard protection of clean file pages using this knob may be used to
++  - prevent disk I/O thrashing under memory pressure even with no free swap space;
++  - improve performance in disk cache-bound tasks under memory pressure;
++  - avoid high latency and prevent livelock in near-OOM conditions.
++
++Setting it to a high value may result in an early out-of-memory condition due to
++the inability to reclaim the protected amount of clean file pages when other
++types of pages cannot be reclaimed.
++
++The unit of measurement is the percentage of the total memory of the node.
++
++The default value is 15.
++
++
+ compact_memory
+ ==============
+ 
+@@ -910,6 +974,14 @@ be 133 (x + 2x = 200, 2x = 133.33).
+ At 0, the kernel will not initiate swap until the amount of free and
+ file-backed pages is less than the high watermark in a zone.
+ 
++This knob has no effect if the amount of clean file pages on the current
++node is below vm.clean_low_ratio or vm.clean_min_ratio. In this case,
++only anonymous pages can be reclaimed.
++
++If the number of anonymous pages on the current node is below
++vm.anon_min_ratio, then only file pages can be reclaimed with
++any vm.swappiness value.
++
+ 
+ unprivileged_userfaultfd
+ ========================
 diff --git a/Makefile b/Makefile
-index 95b320ada47c..0b7d42037c3e 100644
+index a78379891d22..e58a4e647e7d 100644
 --- a/Makefile
 +++ b/Makefile
 @@ -808,9 +808,164 @@ endif # need-config
@@ -6230,10 +7514,10 @@ index 5ff093cb3cf8..1c93fe91b006 100644
  
  /*
 diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
-index 682ff550ccfb..67f17fd94144 100644
+index df3fd6474bf2..4303eb5fe11b 100644
 --- a/drivers/ata/ahci.c
 +++ b/drivers/ata/ahci.c
-@@ -1560,7 +1560,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance)
+@@ -1547,7 +1547,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance)
  }
  #endif
  
@@ -6242,7 +7526,7 @@ index 682ff550ccfb..67f17fd94144 100644
  		struct ahci_host_priv *hpriv)
  {
  	int i;
-@@ -1573,7 +1573,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
+@@ -1560,7 +1560,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
  	    pci_resource_len(pdev, bar) < SZ_512K ||
  	    bar != AHCI_PCI_BAR_STANDARD ||
  	    !(readl(hpriv->mmio + AHCI_VSCAP) & 1))
@@ -6251,7 +7535,7 @@ index 682ff550ccfb..67f17fd94144 100644
  
  	cap = readq(hpriv->mmio + AHCI_REMAP_CAP);
  	for (i = 0; i < AHCI_MAX_REMAP; i++) {
-@@ -1588,18 +1588,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
+@@ -1575,18 +1575,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
  	}
  
  	if (!hpriv->remapped_nvme)
@@ -6274,7 +7558,7 @@ index 682ff550ccfb..67f17fd94144 100644
  }
  
  static int ahci_get_irq_vector(struct ata_host *host, int port)
-@@ -1819,7 +1812,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+@@ -1806,7 +1799,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
  	hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar];
  
  	/* detect remapped nvme devices */
@@ -6305,6 +7589,195 @@ index 438c9e75a04d..1bbfeca5f01e 100644
  	help
  	  This driver adds a CPUFreq driver which utilizes a fine grain
  	  processor performance frequency control range instead of legacy
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+index 79827a6dcd7f..ee85a2352771 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+@@ -153,6 +153,7 @@ struct amdgpu_watchdog_timer
+  */
+ extern int amdgpu_modeset;
+ extern unsigned int amdgpu_vram_limit;
++extern int amdgpu_ignore_min_pcap;
+ extern int amdgpu_vis_vram_limit;
+ extern int amdgpu_gart_size;
+ extern int amdgpu_gtt_size;
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+index 586f4d03039d..a2524615b696 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+@@ -132,6 +132,7 @@ enum AMDGPU_DEBUG_MASK {
+ };
+ 
+ unsigned int amdgpu_vram_limit = UINT_MAX;
++int amdgpu_ignore_min_pcap = 0; /* do not ignore by default */
+ int amdgpu_vis_vram_limit;
+ int amdgpu_gart_size = -1; /* auto */
+ int amdgpu_gtt_size = -1; /* auto */
+@@ -241,6 +242,15 @@ struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
+ 	.period = 0x0, /* default to 0x0 (timeout disable) */
+ };
+ 
++/**
++ * DOC: ignore_min_pcap (int)
++ * Ignore the minimum power cap.
++ * Useful on graphics cards where the minimum power cap is very high.
++ * The default is 0 (Do not ignore).
++ */
++MODULE_PARM_DESC(ignore_min_pcap, "Ignore the minimum power cap");
++module_param_named(ignore_min_pcap, amdgpu_ignore_min_pcap, int, 0600);
++
+ /**
+  * DOC: vramlimit (int)
+  * Restrict the total amount of VRAM in MiB for testing.  The default is 0 (Use full VRAM).
+diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig
+index 901d1961b739..05c49141f580 100644
+--- a/drivers/gpu/drm/amd/display/Kconfig
++++ b/drivers/gpu/drm/amd/display/Kconfig
+@@ -51,4 +51,10 @@ config DRM_AMD_SECURE_DISPLAY
+ 	  This option enables the calculation of crc of specific region via
+ 	  debugfs. Cooperate with specific DMCU FW.
+ 
++config AMD_PRIVATE_COLOR
++	bool "Enable KMS color management by AMD for AMD"
++	default n
++	help
++	  This option extends the KMS color management API with AMD driver-specific properties to enhance the color management support on AMD Steam Deck.
++
+ endmenu
+diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+index 1eb0f82e9dfa..5e0c551759ab 100644
+--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+@@ -4072,7 +4072,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev)
+ 		return r;
+ 	}
+ 
+-#ifdef AMD_PRIVATE_COLOR
++#ifdef CONFIG_AMD_PRIVATE_COLOR
+ 	if (amdgpu_dm_create_color_properties(adev))
+ 		return -ENOMEM;
+ #endif
+diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
+index c87b64e464ed..6fe07243adc3 100644
+--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
+@@ -97,7 +97,7 @@ static inline struct fixed31_32 amdgpu_dm_fixpt_from_s3132(__u64 x)
+ 	return val;
+ }
+ 
+-#ifdef AMD_PRIVATE_COLOR
++#ifdef CONFIG_AMD_PRIVATE_COLOR
+ /* Pre-defined Transfer Functions (TF)
+  *
+  * AMD driver supports pre-defined mathematical functions for transferring
+diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
+index 6e715ef3a556..11c7199ec3b3 100644
+--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
+@@ -290,7 +290,7 @@ static int amdgpu_dm_crtc_late_register(struct drm_crtc *crtc)
+ }
+ #endif
+ 
+-#ifdef AMD_PRIVATE_COLOR
++#ifdef CONFIG_AMD_PRIVATE_COLOR
+ /**
+  * dm_crtc_additional_color_mgmt - enable additional color properties
+  * @crtc: DRM CRTC
+@@ -372,7 +372,7 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = {
+ #if defined(CONFIG_DEBUG_FS)
+ 	.late_register = amdgpu_dm_crtc_late_register,
+ #endif
+-#ifdef AMD_PRIVATE_COLOR
++#ifdef CONFIG_AMD_PRIVATE_COLOR
+ 	.atomic_set_property = amdgpu_dm_atomic_crtc_set_property,
+ 	.atomic_get_property = amdgpu_dm_atomic_crtc_get_property,
+ #endif
+@@ -551,7 +551,7 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm,
+ 
+ 	drm_mode_crtc_set_gamma_size(&acrtc->base, MAX_COLOR_LEGACY_LUT_ENTRIES);
+ 
+-#ifdef AMD_PRIVATE_COLOR
++#ifdef CONFIG_AMD_PRIVATE_COLOR
+ 	dm_crtc_additional_color_mgmt(&acrtc->base);
+ #endif
+ 	return 0;
+diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
+index 8a4c40b4c27e..779880c64575 100644
+--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
+@@ -1468,7 +1468,7 @@ static void amdgpu_dm_plane_drm_plane_destroy_state(struct drm_plane *plane,
+ 	drm_atomic_helper_plane_destroy_state(plane, state);
+ }
+ 
+-#ifdef AMD_PRIVATE_COLOR
++#ifdef CONFIG_AMD_PRIVATE_COLOR
+ static void
+ dm_atomic_plane_attach_color_mgmt_properties(struct amdgpu_display_manager *dm,
+ 					     struct drm_plane *plane)
+@@ -1659,7 +1659,7 @@ static const struct drm_plane_funcs dm_plane_funcs = {
+ 	.atomic_duplicate_state = amdgpu_dm_plane_drm_plane_duplicate_state,
+ 	.atomic_destroy_state = amdgpu_dm_plane_drm_plane_destroy_state,
+ 	.format_mod_supported = amdgpu_dm_plane_format_mod_supported,
+-#ifdef AMD_PRIVATE_COLOR
++#ifdef CONFIG_AMD_PRIVATE_COLOR
+ 	.atomic_set_property = dm_atomic_plane_set_property,
+ 	.atomic_get_property = dm_atomic_plane_get_property,
+ #endif
+@@ -1742,7 +1742,7 @@ int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm,
+ 
+ 	drm_plane_helper_add(plane, &dm_plane_helper_funcs);
+ 
+-#ifdef AMD_PRIVATE_COLOR
++#ifdef CONFIG_AMD_PRIVATE_COLOR
+ 	dm_atomic_plane_attach_color_mgmt_properties(dm, plane);
+ #endif
+ 	/* Create (reset) the plane state */
+diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+index 39c5e1dfa275..ee97bb26a8ef 100644
+--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
++++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+@@ -3034,6 +3034,9 @@ static ssize_t amdgpu_hwmon_show_power_cap_min(struct device *dev,
+ 					 struct device_attribute *attr,
+ 					 char *buf)
+ {
++	if (amdgpu_ignore_min_pcap)
++		return sysfs_emit(buf, "%i\n", 0);
++
+ 	return amdgpu_hwmon_show_power_cap_generic(dev, attr, buf, PP_PWR_LIMIT_MIN);
+ }
+ 
+diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+index 0ad947df777a..7b82e3ef7c91 100644
+--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
++++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+@@ -2695,7 +2695,10 @@ int smu_get_power_limit(void *handle,
+ 			*limit = smu->max_power_limit;
+ 			break;
+ 		case SMU_PPT_LIMIT_MIN:
+-			*limit = smu->min_power_limit;
++			if (amdgpu_ignore_min_pcap)
++				*limit = 0;
++			else
++				*limit = smu->min_power_limit;
+ 			break;
+ 		default:
+ 			return -EINVAL;
+@@ -2719,7 +2722,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit)
+ 		if (smu->ppt_funcs->set_power_limit)
+ 			return smu->ppt_funcs->set_power_limit(smu, limit_type, limit);
+ 
+-	if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) {
++	if (amdgpu_ignore_min_pcap) {
++		if ((limit > smu->max_power_limit)) {
++			dev_err(smu->adev->dev,
++				"New power limit (%d) is over the max allowed %d\n",
++				limit, smu->max_power_limit);
++			return -EINVAL;
++		}
++	} else if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) {
+ 		dev_err(smu->adev->dev,
+ 			"New power limit (%d) is out of range [%d,%d]\n",
+ 			limit, smu->min_power_limit, smu->max_power_limit);
 diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
 index 28eb48dd5b32..1cf4c700b108 100644
 --- a/drivers/i2c/busses/Kconfig
@@ -7573,7 +9046,7 @@ index 000000000000..e105e6f5cc91
 +MODULE_AUTHOR("Daniel Drake <drake@endlessm.com>");
 +MODULE_LICENSE("GPL v2");
 diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
-index d797df6e5f3e..b53d515da054 100644
+index eff7f5df08e2..cfb099dbeb5f 100644
 --- a/drivers/pci/quirks.c
 +++ b/drivers/pci/quirks.c
 @@ -3732,6 +3732,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev)
@@ -8257,10 +9730,10 @@ index 000000000000..77a6677ec19e
 +MODULE_DESCRIPTION("Steam Deck ACPI platform driver");
 +MODULE_LICENSE("GPL");
 diff --git a/include/linux/mm.h b/include/linux/mm.h
-index f5a97dec5169..c9fb00c56844 100644
+index f5a97dec5169..397ad6f1ac39 100644
 --- a/include/linux/mm.h
 +++ b/include/linux/mm.h
-@@ -191,7 +191,7 @@ static inline void __mm_zero_struct_page(struct page *page)
+@@ -191,10 +191,18 @@ static inline void __mm_zero_struct_page(struct page *page)
   * that.
   */
  #define MAPCOUNT_ELF_CORE_MARGIN	(5)
@@ -8269,6 +9742,17 @@ index f5a97dec5169..c9fb00c56844 100644
  
  extern int sysctl_max_map_count;
  
++extern bool sysctl_workingset_protection;
++extern u8 sysctl_anon_min_ratio;
++extern u8 sysctl_clean_low_ratio;
++extern u8 sysctl_clean_min_ratio;
++int vm_workingset_protection_update_handler(
++	struct ctl_table *table, int write,
++	void __user *buffer, size_t *lenp, loff_t *ppos);
++
+ extern unsigned long sysctl_user_reserve_kbytes;
+ extern unsigned long sysctl_admin_reserve_kbytes;
+ 
 diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
 index 2df35e65557d..a52bd9f4b632 100644
 --- a/include/linux/pagemap.h
@@ -8437,7 +9921,7 @@ index 0d944e92a43f..5449c990a91a 100644
  	if (err)
  		goto bad_unshare_out;
 diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
-index 533547e3c90a..fc0a9de42a9d 100644
+index e2b4e0396af8..97983b041e9d 100644
 --- a/kernel/sched/fair.c
 +++ b/kernel/sched/fair.c
 @@ -73,10 +73,19 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
@@ -8487,7 +9971,7 @@ index 001fe047bd5d..ed5c758c7368 100644
  #else
  #define SCHED_NR_MIGRATE_BREAK 32
 diff --git a/kernel/sysctl.c b/kernel/sysctl.c
-index 157f7ce2942d..c92d8a4b23fb 100644
+index 157f7ce2942d..aa55ebba2ec3 100644
 --- a/kernel/sysctl.c
 +++ b/kernel/sysctl.c
 @@ -95,6 +95,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals);
@@ -8516,6 +10000,47 @@ index 157f7ce2942d..c92d8a4b23fb 100644
  #ifdef CONFIG_PROC_SYSCTL
  	{
  		.procname	= "tainted",
+@@ -2204,6 +2216,40 @@ static struct ctl_table vm_table[] = {
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ #endif
++	{
++		.procname	= "workingset_protection",
++		.data		= &sysctl_workingset_protection,
++		.maxlen		= sizeof(bool),
++		.mode		= 0644,
++		.proc_handler	= &proc_dobool,
++	},
++	{
++		.procname	= "anon_min_ratio",
++		.data		= &sysctl_anon_min_ratio,
++		.maxlen		= sizeof(u8),
++		.mode		= 0644,
++		.proc_handler	= &vm_workingset_protection_update_handler,
++		.extra1		= SYSCTL_ZERO,
++		.extra2		= SYSCTL_ONE_HUNDRED,
++	},
++	{
++		.procname	= "clean_low_ratio",
++		.data		= &sysctl_clean_low_ratio,
++		.maxlen		= sizeof(u8),
++		.mode		= 0644,
++		.proc_handler	= &vm_workingset_protection_update_handler,
++		.extra1		= SYSCTL_ZERO,
++		.extra2		= SYSCTL_ONE_HUNDRED,
++	},
++	{
++		.procname	= "clean_min_ratio",
++		.data		= &sysctl_clean_min_ratio,
++		.maxlen		= sizeof(u8),
++		.mode		= 0644,
++		.proc_handler	= &vm_workingset_protection_update_handler,
++		.extra1		= SYSCTL_ZERO,
++		.extra2		= SYSCTL_ONE_HUNDRED,
++	},
+ 	{
+ 		.procname	= "user_reserve_kbytes",
+ 		.data		= &sysctl_user_reserve_kbytes,
 diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
 index ce4d99df5f0e..8272e2e359f1 100644
 --- a/kernel/user_namespace.c
@@ -8535,10 +10060,80 @@ index ce4d99df5f0e..8272e2e359f1 100644
  static DEFINE_MUTEX(userns_state_mutex);
  
 diff --git a/mm/Kconfig b/mm/Kconfig
-index ffc3a2ba3a8c..0e440573033c 100644
+index ffc3a2ba3a8c..002f48b655de 100644
 --- a/mm/Kconfig
 +++ b/mm/Kconfig
-@@ -630,7 +630,7 @@ config COMPACTION
+@@ -486,6 +486,69 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+ config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
+ 	bool
+ 
++config ANON_MIN_RATIO
++	int "Default value for vm.anon_min_ratio"
++	depends on SYSCTL
++	range 0 100
++	default 15
++	help
++	  This option sets the default value for vm.anon_min_ratio sysctl knob.
++
++	  The vm.anon_min_ratio sysctl knob provides *hard* protection of
++	  anonymous pages. The anonymous pages on the current node won't be
++	  reclaimed under any conditions when their amount is below
++	  vm.anon_min_ratio. This knob may be used to prevent excessive swap
++	  thrashing when anonymous memory is low (for example, when memory is
++	  going to be overfilled by compressed data of zram module).
++
++	  Setting this value too high (close to 100) can result in
++	  inability to swap and can lead to early OOM under memory pressure.
++
++config CLEAN_LOW_RATIO
++	int "Default value for vm.clean_low_ratio"
++	depends on SYSCTL
++	range 0 100
++	default 0
++	help
++	  This option sets the default value for vm.clean_low_ratio sysctl knob.
++
++	  The vm.clean_low_ratio sysctl knob provides *best-effort*
++	  protection of clean file pages. The file pages on the current node
++	  won't be reclaimed under memory pressure when the amount of clean file
++	  pages is below vm.clean_low_ratio *unless* we threaten to OOM.
++	  Protection of clean file pages using this knob may be used when
++	  swapping is still possible to
++	    - prevent disk I/O thrashing under memory pressure;
++	    - improve performance in disk cache-bound tasks under memory
++	      pressure.
++
++	  Setting it to a high value may result in an early eviction of anonymous
++	  pages into the swap space by attempting to hold the protected amount
++	  of clean file pages in memory.
++
++config CLEAN_MIN_RATIO
++	int "Default value for vm.clean_min_ratio"
++	depends on SYSCTL
++	range 0 100
++	default 15
++	help
++	  This option sets the default value for vm.clean_min_ratio sysctl knob.
++
++	  The vm.clean_min_ratio sysctl knob provides *hard* protection of
++	  clean file pages. The file pages on the current node won't be
++	  reclaimed under memory pressure when the amount of clean file pages is
++	  below vm.clean_min_ratio. Hard protection of clean file pages using
++	  this knob may be used to
++	    - prevent disk I/O thrashing under memory pressure even with no free
++	      swap space;
++	    - improve performance in disk cache-bound tasks under memory
++	      pressure;
++	    - avoid high latency and prevent livelock in near-OOM conditions.
++
++	  Setting it to a high value may result in an early out-of-memory condition
++	  due to the inability to reclaim the protected amount of clean file pages
++	  when other types of pages cannot be reclaimed.
++
+ config HAVE_MEMBLOCK_PHYS_MAP
+ 	bool
+ 
+@@ -630,7 +693,7 @@ config COMPACTION
  config COMPACT_UNEVICTABLE_DEFAULT
  	int
  	depends on COMPACTION
@@ -8579,6 +10174,18 @@ index 94c958f7ebb5..2f9974f305ee 100644
  	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
  	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
  
+diff --git a/mm/mm_init.c b/mm/mm_init.c
+index 2c19f5515e36..419ba5ac7c52 100644
+--- a/mm/mm_init.c
++++ b/mm/mm_init.c
+@@ -2749,6 +2749,7 @@ static void __init mem_init_print_info(void)
+ 		, K(totalhigh_pages())
+ #endif
+ 		);
++	printk(KERN_INFO "le9 Unofficial (le9uo) working set protection 1.4 by Masahito Suzuki (forked from hakavlad's original le9 patch)");
+ }
+ 
+ /*
 diff --git a/mm/page-writeback.c b/mm/page-writeback.c
 index 3f255534986a..01b3e5cb8da1 100644
 --- a/mm/page-writeback.c
@@ -8711,10 +10318,38 @@ index bd5183dfd879..3a410f53a07c 100644
  
  /*
 diff --git a/mm/vmscan.c b/mm/vmscan.c
-index 4255619a1a31..5a3fbaf34158 100644
+index 4255619a1a31..62f42e92964f 100644
 --- a/mm/vmscan.c
 +++ b/mm/vmscan.c
-@@ -185,7 +185,11 @@ struct scan_control {
+@@ -133,6 +133,15 @@ struct scan_control {
+ 	/* The file folios on the current node are dangerously low */
+ 	unsigned int file_is_tiny:1;
+ 
++	/* The anonymous pages on the current node are below vm.anon_min_ratio */
++	unsigned int anon_below_min:1;
++
++	/* The clean file pages on the current node are below vm.clean_low_ratio */
++	unsigned int clean_below_low:1;
++
++	/* The clean file pages on the current node are below vm.clean_min_ratio */
++	unsigned int clean_below_min:1;
++
+ 	/* Always discard instead of demoting to lower tier memory */
+ 	unsigned int no_demotion:1;
+ 
+@@ -182,10 +191,23 @@ struct scan_control {
+ #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
+ #endif
+ 
++bool sysctl_workingset_protection __read_mostly = true;
++u8 sysctl_anon_min_ratio  __read_mostly = CONFIG_ANON_MIN_RATIO;
++u8 sysctl_clean_low_ratio __read_mostly = CONFIG_CLEAN_LOW_RATIO;
++u8 sysctl_clean_min_ratio __read_mostly = CONFIG_CLEAN_MIN_RATIO;
++static u64 sysctl_anon_min_ratio_kb  __read_mostly = 0;
++static u64 sysctl_clean_low_ratio_kb __read_mostly = 0;
++static u64 sysctl_clean_min_ratio_kb __read_mostly = 0;
++static u64 workingset_protection_prev_totalram __read_mostly = 0;
++
  /*
   * From 0 .. 200.  Higher means more swappy.
   */
@@ -8726,7 +10361,56 @@ index 4255619a1a31..5a3fbaf34158 100644
  
  #ifdef CONFIG_MEMCG
  
-@@ -3922,7 +3926,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
+@@ -1052,6 +1074,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
+ 		    folio_mapped(folio) && folio_test_referenced(folio))
+ 			goto keep_locked;
+ 
++		if (folio_is_file_lru(folio) ? sc->clean_below_min : sc->anon_below_min)
++			goto keep_locked;
++
+ 		/*
+ 		 * The number of dirty pages determines if a node is marked
+ 		 * reclaim_congested. kswapd will stall and start writing
+@@ -2353,6 +2378,23 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ 		goto out;
+ 	}
+ 
++	/*
++	 * Force-scan the other type if anon/clean pages is
++	 * under vm.{anon,clean}_{low,min}_ratio, respectively.
++	 */
++	if (sc->clean_below_min) {
++		scan_balance = SCAN_ANON;
++		goto out;
++	}
++	if (sc->anon_below_min) {
++		scan_balance = SCAN_FILE;
++		goto out;
++	}
++	if (sc->clean_below_low) {
++		scan_balance = SCAN_ANON;
++		goto out;
++	}
++
+ 	/*
+ 	 * Do not apply any pressure balancing cleverness when the
+ 	 * system is close to OOM, scan both anon and file equally
+@@ -2515,6 +2557,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ 			BUG();
+ 		}
+ 
++		/*
++		 * Hard protection of the working set.
++		 * Don't reclaim anon/file pages when the amount is
++		 * below the watermark of the same type.
++		 */
++		if (file ? sc->clean_below_min : sc->anon_below_min)
++			scan = 0;
++
+ 		nr[lru] = scan;
+ 	}
+ }
+@@ -3922,7 +3972,28 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
  }
  
  /* to protect the working set of the last N jiffies */
@@ -8735,22 +10419,819 @@ index 4255619a1a31..5a3fbaf34158 100644
 +#else
  static unsigned long lru_gen_min_ttl __read_mostly;
 +#endif
++
++static void do_invoke_oom(struct scan_control *sc, bool try_memcg) {
++	struct oom_control oc = {
++		.gfp_mask = sc->gfp_mask,
++		.order = sc->order,
++	};
++
++	if (try_memcg && mem_cgroup_oom_synchronize(true))
++		return;
++
++	if (!mutex_trylock(&oom_lock))
++		return;
++	out_of_memory(&oc);
++	mutex_unlock(&oom_lock);
++}
++#define invoke_oom(sc)         do_invoke_oom(sc, true)
++#define invoke_oom_nomemcg(sc) do_invoke_oom(sc, false)
  
  static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  {
+@@ -3952,14 +4023,96 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+ 	 * younger than min_ttl. However, another possibility is all memcgs are
+ 	 * either too small or below min.
+ 	 */
+-	if (mutex_trylock(&oom_lock)) {
+-		struct oom_control oc = {
+-			.gfp_mask = sc->gfp_mask,
+-		};
++	invoke_oom_nomemcg(sc);
++}
++
++int vm_workingset_protection_update_handler(struct ctl_table *table, int write,
++		void __user *buffer, size_t *lenp, loff_t *ppos)
++{
++	int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
++	if (ret || !write)
++		return ret;
++
++	workingset_protection_prev_totalram = 0;
++
++	return 0;
++}
++
++static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc)
++{
++	unsigned long node_mem_total;
++	struct sysinfo i;
++
++	if (!(sysctl_workingset_protection)) {
++		sc->anon_below_min = 0;
++		sc->clean_below_low = 0;
++		sc->clean_below_min = 0;
++		return;
++	}
++
++	if (likely(sysctl_anon_min_ratio  ||
++	           sysctl_clean_low_ratio ||
++		       sysctl_clean_min_ratio)) {
++#ifdef CONFIG_NUMA
++		si_meminfo_node(&i, pgdat->node_id);
++#else //CONFIG_NUMA
++		si_meminfo(&i);
++#endif //CONFIG_NUMA
++		node_mem_total = i.totalram;
++
++		if (unlikely(workingset_protection_prev_totalram != node_mem_total)) {
++			sysctl_anon_min_ratio_kb  =
++				node_mem_total * sysctl_anon_min_ratio  / 100;
++			sysctl_clean_low_ratio_kb =
++				node_mem_total * sysctl_clean_low_ratio / 100;
++			sysctl_clean_min_ratio_kb =
++				node_mem_total * sysctl_clean_min_ratio / 100;
++			workingset_protection_prev_totalram = node_mem_total;
++		}
++	}
+ 
+-		out_of_memory(&oc);
++	/*
++	 * Check the number of anonymous pages to protect them from
++	 * reclaiming if their amount is below the specified.
++	 */
++	if (sysctl_anon_min_ratio) {
++		unsigned long reclaimable_anon;
++
++		reclaimable_anon =
++			node_page_state(pgdat, NR_ACTIVE_ANON) +
++			node_page_state(pgdat, NR_INACTIVE_ANON) +
++			node_page_state(pgdat, NR_ISOLATED_ANON);
++
++		sc->anon_below_min = reclaimable_anon < sysctl_anon_min_ratio_kb;
++	} else
++		sc->anon_below_min = 0;
++
++	/*
++	 * Check the number of clean file pages to protect them from
++	 * reclaiming if their amount is below the specified.
++	 */
++	if (sysctl_clean_low_ratio || sysctl_clean_min_ratio) {
++		unsigned long reclaimable_file, dirty, clean;
++
++		reclaimable_file =
++			node_page_state(pgdat, NR_ACTIVE_FILE) +
++			node_page_state(pgdat, NR_INACTIVE_FILE) +
++			node_page_state(pgdat, NR_ISOLATED_FILE);
++		dirty = node_page_state(pgdat, NR_FILE_DIRTY);
++		/*
++		 * node_page_state() sum can go out of sync since
++		 * all the values are not read at once.
++		 */
++		if (likely(reclaimable_file > dirty))
++			clean = reclaimable_file - dirty;
++		else
++			clean = 0;
+ 
+-		mutex_unlock(&oom_lock);
++		sc->clean_below_low = clean < sysctl_clean_low_ratio_kb;
++		sc->clean_below_min = clean < sysctl_clean_min_ratio_kb;
++	} else {
++		sc->clean_below_low = 0;
++		sc->clean_below_min = 0;
+ 	}
+ }
+ 
+@@ -4462,6 +4615,12 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
+ 	 */
+ 	if (!swappiness)
+ 		type = LRU_GEN_FILE;
++	else if (sc->clean_below_min)
++		type = LRU_GEN_ANON;
++	else if (sc->anon_below_min)
++		type = LRU_GEN_FILE;
++	else if (sc->clean_below_low)
++		type = LRU_GEN_ANON;
+ 	else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
+ 		type = LRU_GEN_ANON;
+ 	else if (swappiness == 1)
+@@ -4471,7 +4630,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
+ 	else
+ 		type = get_type_to_scan(lruvec, swappiness, &tier);
+ 
+-	for (i = !swappiness; i < ANON_AND_FILE; i++) {
++	for (i = 0; i < ANON_AND_FILE; i++) {
+ 		if (tier < 0)
+ 			tier = get_tier_idx(lruvec, type);
+ 
+@@ -4749,6 +4908,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ 
++	prepare_workingset_protection(pgdat, sc);
+ 	mem_cgroup_calculate_protection(NULL, memcg);
+ 
+ 	if (mem_cgroup_below_min(NULL, memcg))
+@@ -5899,6 +6059,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+ 
+ 	prepare_scan_control(pgdat, sc);
+ 
++	prepare_workingset_protection(pgdat, sc);
++
+ 	shrink_node_memcgs(pgdat, sc);
+ 
+ 	flush_reclaim_state(sc);
+@@ -5987,6 +6149,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+ 	 */
+ 	if (reclaimable)
+ 		pgdat->kswapd_failures = 0;
++	else if (sc->clean_below_min && !sc->priority)
++		invoke_oom(sc);
+ }
+ 
+ /*
 -- 
 2.44.0
 
-From 516559b0e31629dafbe60212d041e63af1b12c1c Mon Sep 17 00:00:00 2001
+From 3719b448ce6ae6e6df7f49a99ef30eeb0bf2117d Mon Sep 17 00:00:00 2001
 From: Peter Jung <admin@ptr1337.dev>
-Date: Mon, 26 Feb 2024 15:47:43 +0100
-Subject: [PATCH 5/7] fixes
+Date: Wed, 3 Apr 2024 17:43:37 +0200
+Subject: [PATCH] Revert "le9uo"
+
+This reverts commit 9bb31a68ef456524c4370323e1c19b07fc0633df.
+---
+ Documentation/admin-guide/sysctl/vm.rst |  72 ----------
+ include/linux/mm.h                      |   8 --
+ kernel/sysctl.c                         |  34 -----
+ mm/Kconfig                              |  63 ---------
+ mm/mm_init.c                            |   1 -
+ mm/vmscan.c                             | 170 +-----------------------
+ 6 files changed, 7 insertions(+), 341 deletions(-)
+
+diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
+index 468ae7dec1e1..c59889de122b 100644
+--- a/Documentation/admin-guide/sysctl/vm.rst
++++ b/Documentation/admin-guide/sysctl/vm.rst
+@@ -25,9 +25,6 @@ files can be found in mm/swap.c.
+ Currently, these files are in /proc/sys/vm:
+ 
+ - admin_reserve_kbytes
+-- anon_min_ratio
+-- clean_low_ratio
+-- clean_min_ratio
+ - compact_memory
+ - compaction_proactiveness
+ - compact_unevictable_allowed
+@@ -109,67 +106,6 @@ On x86_64 this is about 128MB.
+ Changing this takes effect whenever an application requests memory.
+ 
+ 
+-anon_min_ratio
+-==============
+-
+-This knob provides *hard* protection of anonymous pages. The anonymous pages
+-on the current node won't be reclaimed under any conditions when their amount
+-is below vm.anon_min_ratio.
+-
+-This knob may be used to prevent excessive swap thrashing when anonymous
+-memory is low (for example, when memory is going to be overfilled by
+-compressed data of zram module).
+-
+-Setting this value too high (close to 100) can result in inability to
+-swap and can lead to early OOM under memory pressure.
+-
+-The unit of measurement is the percentage of the total memory of the node.
+-
+-The default value is 15.
+-
+-
+-clean_low_ratio
+-================
+-
+-This knob provides *best-effort* protection of clean file pages. The file pages
+-on the current node won't be reclaimed under memory pressure when the amount of
+-clean file pages is below vm.clean_low_ratio *unless* we threaten to OOM.
+-
+-Protection of clean file pages using this knob may be used when swapping is
+-still possible to
+-  - prevent disk I/O thrashing under memory pressure;
+-  - improve performance in disk cache-bound tasks under memory pressure.
+-
+-Setting it to a high value may result in a early eviction of anonymous pages
+-into the swap space by attempting to hold the protected amount of clean file
+-pages in memory.
+-
+-The unit of measurement is the percentage of the total memory of the node.
+-
+-The default value is 0.
+-
+-
+-clean_min_ratio
+-================
+-
+-This knob provides *hard* protection of clean file pages. The file pages on the
+-current node won't be reclaimed under memory pressure when the amount of clean
+-file pages is below vm.clean_min_ratio.
+-
+-Hard protection of clean file pages using this knob may be used to
+-  - prevent disk I/O thrashing under memory pressure even with no free swap space;
+-  - improve performance in disk cache-bound tasks under memory pressure;
+-  - avoid high latency and prevent livelock in near-OOM conditions.
+-
+-Setting it to a high value may result in a early out-of-memory condition due to
+-the inability to reclaim the protected amount of clean file pages when other
+-types of pages cannot be reclaimed.
+-
+-The unit of measurement is the percentage of the total memory of the node.
+-
+-The default value is 15.
+-
+-
+ compact_memory
+ ==============
+ 
+@@ -974,14 +910,6 @@ be 133 (x + 2x = 200, 2x = 133.33).
+ At 0, the kernel will not initiate swap until the amount of free and
+ file-backed pages is less than the high watermark in a zone.
+ 
+-This knob has no effect if the amount of clean file pages on the current
+-node is below vm.clean_low_ratio or vm.clean_min_ratio. In this case,
+-only anonymous pages can be reclaimed.
+-
+-If the number of anonymous pages on the current node is below
+-vm.anon_min_ratio, then only file pages can be reclaimed with
+-any vm.swappiness value.
+-
+ 
+ unprivileged_userfaultfd
+ ========================
+diff --git a/include/linux/mm.h b/include/linux/mm.h
+index 397ad6f1ac39..c9fb00c56844 100644
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -195,14 +195,6 @@ static inline void __mm_zero_struct_page(struct page *page)
+ 
+ extern int sysctl_max_map_count;
+ 
+-extern bool sysctl_workingset_protection;
+-extern u8 sysctl_anon_min_ratio;
+-extern u8 sysctl_clean_low_ratio;
+-extern u8 sysctl_clean_min_ratio;
+-int vm_workingset_protection_update_handler(
+-	struct ctl_table *table, int write,
+-	void __user *buffer, size_t *lenp, loff_t *ppos);
+-
+ extern unsigned long sysctl_user_reserve_kbytes;
+ extern unsigned long sysctl_admin_reserve_kbytes;
+ 
+diff --git a/kernel/sysctl.c b/kernel/sysctl.c
+index aa55ebba2ec3..c92d8a4b23fb 100644
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -2216,40 +2216,6 @@ static struct ctl_table vm_table[] = {
+ 		.extra1		= SYSCTL_ZERO,
+ 	},
+ #endif
+-	{
+-		.procname	= "workingset_protection",
+-		.data		= &sysctl_workingset_protection,
+-		.maxlen		= sizeof(bool),
+-		.mode		= 0644,
+-		.proc_handler	= &proc_dobool,
+-	},
+-	{
+-		.procname	= "anon_min_ratio",
+-		.data		= &sysctl_anon_min_ratio,
+-		.maxlen		= sizeof(u8),
+-		.mode		= 0644,
+-		.proc_handler	= &vm_workingset_protection_update_handler,
+-		.extra1		= SYSCTL_ZERO,
+-		.extra2		= SYSCTL_ONE_HUNDRED,
+-	},
+-	{
+-		.procname	= "clean_low_ratio",
+-		.data		= &sysctl_clean_low_ratio,
+-		.maxlen		= sizeof(u8),
+-		.mode		= 0644,
+-		.proc_handler	= &vm_workingset_protection_update_handler,
+-		.extra1		= SYSCTL_ZERO,
+-		.extra2		= SYSCTL_ONE_HUNDRED,
+-	},
+-	{
+-		.procname	= "clean_min_ratio",
+-		.data		= &sysctl_clean_min_ratio,
+-		.maxlen		= sizeof(u8),
+-		.mode		= 0644,
+-		.proc_handler	= &vm_workingset_protection_update_handler,
+-		.extra1		= SYSCTL_ZERO,
+-		.extra2		= SYSCTL_ONE_HUNDRED,
+-	},
+ 	{
+ 		.procname	= "user_reserve_kbytes",
+ 		.data		= &sysctl_user_reserve_kbytes,
+diff --git a/mm/Kconfig b/mm/Kconfig
+index 002f48b655de..0e440573033c 100644
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -486,69 +486,6 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+ config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
+ 	bool
+ 
+-config ANON_MIN_RATIO
+-	int "Default value for vm.anon_min_ratio"
+-	depends on SYSCTL
+-	range 0 100
+-	default 15
+-	help
+-	  This option sets the default value for vm.anon_min_ratio sysctl knob.
+-
+-	  The vm.anon_min_ratio sysctl knob provides *hard* protection of
+-	  anonymous pages. The anonymous pages on the current node won't be
+-	  reclaimed under any conditions when their amount is below
+-	  vm.anon_min_ratio. This knob may be used to prevent excessive swap
+-	  thrashing when anonymous memory is low (for example, when memory is
+-	  going to be overfilled by compressed data of zram module).
+-
+-	  Setting this value too high (close to MemTotal) can result in
+-	  inability to swap and can lead to early OOM under memory pressure.
+-
+-config CLEAN_LOW_RATIO
+-	int "Default value for vm.clean_low_ratio"
+-	depends on SYSCTL
+-	range 0 100
+-	default 0
+-	help
+-	  This option sets the default value for vm.clean_low_ratio sysctl knob.
+-
+-	  The vm.clean_low_ratio sysctl knob provides *best-effort*
+-	  protection of clean file pages. The file pages on the current node
+-	  won't be reclaimed under memory pressure when the amount of clean file
+-	  pages is below vm.clean_low_ratio *unless* we threaten to OOM.
+-	  Protection of clean file pages using this knob may be used when
+-	  swapping is still possible to
+-	    - prevent disk I/O thrashing under memory pressure;
+-	    - improve performance in disk cache-bound tasks under memory
+-	      pressure.
+-
+-	  Setting it to a high value may result in a early eviction of anonymous
+-	  pages into the swap space by attempting to hold the protected amount
+-	  of clean file pages in memory.
+-
+-config CLEAN_MIN_RATIO
+-	int "Default value for vm.clean_min_ratio"
+-	depends on SYSCTL
+-	range 0 100
+-	default 15
+-	help
+-	  This option sets the default value for vm.clean_min_ratio sysctl knob.
+-
+-	  The vm.clean_min_ratio sysctl knob provides *hard* protection of
+-	  clean file pages. The file pages on the current node won't be
+-	  reclaimed under memory pressure when the amount of clean file pages is
+-	  below vm.clean_min_ratio. Hard protection of clean file pages using
+-	  this knob may be used to
+-	    - prevent disk I/O thrashing under memory pressure even with no free
+-	      swap space;
+-	    - improve performance in disk cache-bound tasks under memory
+-	      pressure;
+-	    - avoid high latency and prevent livelock in near-OOM conditions.
+-
+-	  Setting it to a high value may result in a early out-of-memory condition
+-	  due to the inability to reclaim the protected amount of clean file pages
+-	  when other types of pages cannot be reclaimed.
+-
+ config HAVE_MEMBLOCK_PHYS_MAP
+ 	bool
+ 
+diff --git a/mm/mm_init.c b/mm/mm_init.c
+index 419ba5ac7c52..2c19f5515e36 100644
+--- a/mm/mm_init.c
++++ b/mm/mm_init.c
+@@ -2749,7 +2749,6 @@ static void __init mem_init_print_info(void)
+ 		, K(totalhigh_pages())
+ #endif
+ 		);
+-	printk(KERN_INFO "le9 Unofficial (le9uo) working set protection 1.4 by Masahito Suzuki (forked from hakavlad's original le9 patch)");
+ }
+ 
+ /*
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 346810e1b69d..fd1d9b4194e3 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -133,15 +133,6 @@ struct scan_control {
+ 	/* The file folios on the current node are dangerously low */
+ 	unsigned int file_is_tiny:1;
+ 
+-	/* The anonymous pages on the current node are below vm.anon_min_ratio */
+-	unsigned int anon_below_min:1;
+-
+-	/* The clean file pages on the current node are below vm.clean_low_ratio */
+-	unsigned int clean_below_low:1;
+-
+-	/* The clean file pages on the current node are below vm.clean_min_ratio */
+-	unsigned int clean_below_min:1;
+-
+ 	/* Always discard instead of demoting to lower tier memory */
+ 	unsigned int no_demotion:1;
+ 
+@@ -191,15 +182,6 @@ struct scan_control {
+ #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
+ #endif
+ 
+-bool sysctl_workingset_protection __read_mostly = true;
+-u8 sysctl_anon_min_ratio  __read_mostly = CONFIG_ANON_MIN_RATIO;
+-u8 sysctl_clean_low_ratio __read_mostly = CONFIG_CLEAN_LOW_RATIO;
+-u8 sysctl_clean_min_ratio __read_mostly = CONFIG_CLEAN_MIN_RATIO;
+-static u64 sysctl_anon_min_ratio_kb  __read_mostly = 0;
+-static u64 sysctl_clean_low_ratio_kb __read_mostly = 0;
+-static u64 sysctl_clean_min_ratio_kb __read_mostly = 0;
+-static u64 workingset_protection_prev_totalram __read_mostly = 0;
+-
+ /*
+  * From 0 .. 200.  Higher means more swappy.
+  */
+@@ -1074,9 +1056,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
+ 		    folio_mapped(folio) && folio_test_referenced(folio))
+ 			goto keep_locked;
+ 
+-		if (folio_is_file_lru(folio) ? sc->clean_below_min : sc->anon_below_min)
+-			goto keep_locked;
+-
+ 		/*
+ 		 * The number of dirty pages determines if a node is marked
+ 		 * reclaim_congested. kswapd will stall and start writing
+@@ -2378,23 +2357,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ 		goto out;
+ 	}
+ 
+-	/*
+-	 * Force-scan the other type if anon/clean pages is
+-	 * under vm.{anon,clean}_{low,min}_ratio, respectively.
+-	 */
+-	if (sc->clean_below_min) {
+-		scan_balance = SCAN_ANON;
+-		goto out;
+-	}
+-	if (sc->anon_below_min) {
+-		scan_balance = SCAN_FILE;
+-		goto out;
+-	}
+-	if (sc->clean_below_low) {
+-		scan_balance = SCAN_ANON;
+-		goto out;
+-	}
+-
+ 	/*
+ 	 * Do not apply any pressure balancing cleverness when the
+ 	 * system is close to OOM, scan both anon and file equally
+@@ -2557,14 +2519,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+ 			BUG();
+ 		}
+ 
+-		/*
+-		 * Hard protection of the working set.
+-		 * Don't reclaim anon/file pages when the amount is
+-		 * below the watermark of the same type.
+-		 */
+-		if (file ? sc->clean_below_min : sc->anon_below_min)
+-			scan = 0;
+-
+ 		nr[lru] = scan;
+ 	}
+ }
+@@ -3978,23 +3932,6 @@ static unsigned long lru_gen_min_ttl __read_mostly = 1000;
+ static unsigned long lru_gen_min_ttl __read_mostly;
+ #endif
+ 
+-static void do_invoke_oom(struct scan_control *sc, bool try_memcg) {
+-	struct oom_control oc = {
+-		.gfp_mask = sc->gfp_mask,
+-		.order = sc->order,
+-	};
+-
+-	if (try_memcg && mem_cgroup_oom_synchronize(true))
+-		return;
+-
+-	if (!mutex_trylock(&oom_lock))
+-		return;
+-	out_of_memory(&oc);
+-	mutex_unlock(&oom_lock);
+-}
+-#define invoke_oom(sc)         do_invoke_oom(sc, true)
+-#define invoke_oom_nomemcg(sc) do_invoke_oom(sc, false)
+-
+ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+ {
+ 	struct mem_cgroup *memcg;
+@@ -4023,96 +3960,14 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
+ 	 * younger than min_ttl. However, another possibility is all memcgs are
+ 	 * either too small or below min.
+ 	 */
+-	invoke_oom_nomemcg(sc);
+-}
+-
+-int vm_workingset_protection_update_handler(struct ctl_table *table, int write,
+-		void __user *buffer, size_t *lenp, loff_t *ppos)
+-{
+-	int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
+-	if (ret || !write)
+-		return ret;
+-
+-	workingset_protection_prev_totalram = 0;
+-
+-	return 0;
+-}
+-
+-static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc)
+-{
+-	unsigned long node_mem_total;
+-	struct sysinfo i;
+-
+-	if (!(sysctl_workingset_protection)) {
+-		sc->anon_below_min = 0;
+-		sc->clean_below_low = 0;
+-		sc->clean_below_min = 0;
+-		return;
+-	}
+-
+-	if (likely(sysctl_anon_min_ratio  ||
+-	           sysctl_clean_low_ratio ||
+-		       sysctl_clean_min_ratio)) {
+-#ifdef CONFIG_NUMA
+-		si_meminfo_node(&i, pgdat->node_id);
+-#else //CONFIG_NUMA
+-		si_meminfo(&i);
+-#endif //CONFIG_NUMA
+-		node_mem_total = i.totalram;
+-
+-		if (unlikely(workingset_protection_prev_totalram != node_mem_total)) {
+-			sysctl_anon_min_ratio_kb  =
+-				node_mem_total * sysctl_anon_min_ratio  / 100;
+-			sysctl_clean_low_ratio_kb =
+-				node_mem_total * sysctl_clean_low_ratio / 100;
+-			sysctl_clean_min_ratio_kb =
+-				node_mem_total * sysctl_clean_min_ratio / 100;
+-			workingset_protection_prev_totalram = node_mem_total;
+-		}
+-	}
+-
+-	/*
+-	 * Check the number of anonymous pages to protect them from
+-	 * reclaiming if their amount is below the specified.
+-	 */
+-	if (sysctl_anon_min_ratio) {
+-		unsigned long reclaimable_anon;
+-
+-		reclaimable_anon =
+-			node_page_state(pgdat, NR_ACTIVE_ANON) +
+-			node_page_state(pgdat, NR_INACTIVE_ANON) +
+-			node_page_state(pgdat, NR_ISOLATED_ANON);
++	if (mutex_trylock(&oom_lock)) {
++		struct oom_control oc = {
++			.gfp_mask = sc->gfp_mask,
++		};
+ 
+-		sc->anon_below_min = reclaimable_anon < sysctl_anon_min_ratio_kb;
+-	} else
+-		sc->anon_below_min = 0;
++		out_of_memory(&oc);
+ 
+-	/*
+-	 * Check the number of clean file pages to protect them from
+-	 * reclaiming if their amount is below the specified.
+-	 */
+-	if (sysctl_clean_low_ratio || sysctl_clean_min_ratio) {
+-		unsigned long reclaimable_file, dirty, clean;
+-
+-		reclaimable_file =
+-			node_page_state(pgdat, NR_ACTIVE_FILE) +
+-			node_page_state(pgdat, NR_INACTIVE_FILE) +
+-			node_page_state(pgdat, NR_ISOLATED_FILE);
+-		dirty = node_page_state(pgdat, NR_FILE_DIRTY);
+-		/*
+-		 * node_page_state() sum can go out of sync since
+-		 * all the values are not read at once.
+-		 */
+-		if (likely(reclaimable_file > dirty))
+-			clean = reclaimable_file - dirty;
+-		else
+-			clean = 0;
+-
+-		sc->clean_below_low = clean < sysctl_clean_low_ratio_kb;
+-		sc->clean_below_min = clean < sysctl_clean_min_ratio_kb;
+-	} else {
+-		sc->clean_below_low = 0;
+-		sc->clean_below_min = 0;
++		mutex_unlock(&oom_lock);
+ 	}
+ }
+ 
+@@ -4615,12 +4470,6 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
+ 	 */
+ 	if (!swappiness)
+ 		type = LRU_GEN_FILE;
+-	else if (sc->clean_below_min)
+-		type = LRU_GEN_ANON;
+-	else if (sc->anon_below_min)
+-		type = LRU_GEN_FILE;
+-	else if (sc->clean_below_low)
+-		type = LRU_GEN_ANON;
+ 	else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
+ 		type = LRU_GEN_ANON;
+ 	else if (swappiness == 1)
+@@ -4630,7 +4479,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
+ 	else
+ 		type = get_type_to_scan(lruvec, swappiness, &tier);
+ 
+-	for (i = 0; i < ANON_AND_FILE; i++) {
++	for (i = !swappiness; i < ANON_AND_FILE; i++) {
+ 		if (tier < 0)
+ 			tier = get_tier_idx(lruvec, type);
+ 
+@@ -4908,7 +4757,6 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+ 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ 
+-	prepare_workingset_protection(pgdat, sc);
+ 	mem_cgroup_calculate_protection(NULL, memcg);
+ 
+ 	if (mem_cgroup_below_min(NULL, memcg))
+@@ -6059,8 +5907,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+ 
+ 	prepare_scan_control(pgdat, sc);
+ 
+-	prepare_workingset_protection(pgdat, sc);
+-
+ 	shrink_node_memcgs(pgdat, sc);
+ 
+ 	flush_reclaim_state(sc);
+@@ -6149,8 +5995,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+ 	 */
+ 	if (reclaimable)
+ 		pgdat->kswapd_failures = 0;
+-	else if (sc->clean_below_min && !sc->priority)
+-		invoke_oom(sc);
+ }
+ 
+ /*
+-- 
+2.44.0
+
+From 4833f48c9738d6bb475df2e4c16be2ea26a7d91d Mon Sep 17 00:00:00 2001
+From: Peter Jung <admin@ptr1337.dev>
+Date: Wed, 3 Apr 2024 17:07:02 +0200
+Subject: [PATCH 6/8] fixes
 
 Signed-off-by: Peter Jung <admin@ptr1337.dev>
 ---
- arch/Kconfig | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
+ .../ABI/testing/sysfs-driver-hid-asus         |   85 +
+ arch/Kconfig                                  |    4 +-
+ drivers/hid/Makefile                          |    2 +
+ drivers/hid/{hid-asus.c => hid-asus-core.c}   |  193 +--
+ drivers/hid/hid-asus-rog.c                    | 1468 +++++++++++++++++
+ drivers/hid/hid-asus-rog.h                    |  482 ++++++
+ drivers/hid/hid-asus.h                        |   58 +
+ drivers/hid/hid-ids.h                         |    1 +
+ 8 files changed, 2174 insertions(+), 119 deletions(-)
+ create mode 100644 Documentation/ABI/testing/sysfs-driver-hid-asus
+ rename drivers/hid/{hid-asus.c => hid-asus-core.c} (89%)
+ create mode 100644 drivers/hid/hid-asus-rog.c
+ create mode 100644 drivers/hid/hid-asus-rog.h
+ create mode 100644 drivers/hid/hid-asus.h
 
+diff --git a/Documentation/ABI/testing/sysfs-driver-hid-asus b/Documentation/ABI/testing/sysfs-driver-hid-asus
+new file mode 100644
+index 000000000000..df5b0c5b0702
+--- /dev/null
++++ b/Documentation/ABI/testing/sysfs-driver-hid-asus
+@@ -0,0 +1,85 @@
++What:		/sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/gamepad_mode
++Date:		December 2023
++Contact:	linux-input@vger.kernel.org
++Description:	Set the mode the ROG Ally xpad operates in:
++		- 1 = Game mode
++		- 2 = WASD mode
++		- 3 = Mouse mode
++		This setting takes effect instantly and also applies the settings that were
++		previously changed under that mode, which are:
++		- deadzones
++		- anti-deadzones
++		- button mapping
++		- button turbo settings
++		- response curves
++
++What:		/sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/apply
++Date:		December 2023
++Contact:	linux-input@vger.kernel.org
++Description:	Apply the settings that have been stored in attributes so far. Because there are
++		many individual settings across a dozen packets, this separation is required to
++		prevent spamming the MCU when userspace applications apply many changes at once.
++
++What:		/sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/reset_btn_mapping
++Date:		December 2023
++Contact:	linux-input@vger.kernel.org
++Description:	Reset a gamepad mode to its default button mapping.
++
++What:		/sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis_<x/y/z>_<left/right>/deadzone
++Date:		December 2023
++Contact:	linux-input@vger.kernel.org
++Description:	Set the inner and outer deadzones of joysticks and triggers. These settings are not
++		written to the MCU until `apply` is set.
++		- range 0-64 (corresponds to 0-100%)
++
++What:		/sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis_<x/y/z>_<left/right>/deadzone_index
++Date:		December 2023
++Contact:	linux-input@vger.kernel.org
++Description:	Descriptive labels for joystick deadzone array.
++
++What:		/sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis_<x/y>_<left/right>/anti-deadzone
++Date:		December 2023
++Contact:	linux-input@vger.kernel.org
++Description:	Set the joystick anti-deadzone feature:
++		- range 0-32 (corresponds to 0-50%)
++
++What:		/sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis_<x/y/z>_<left/right>/calibration
++Date:		December 2023
++Contact:	linux-input@vger.kernel.org
++Description:	Calibration values for the joysticks and trigger analogues. There are no default
++		values as the calibration is determined in userspace.
++
++What:		/sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis_<x/y/z>_<left/right>/calibration_index
++Date:		December 2023
++Contact:	linux-input@vger.kernel.org
++Description:	Descriptive labels for joystick and triggers calibration array.
++
++What:		/sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis_<x/y>_<left/right>/rc_point<n>
++Date:		December 2023
++Contact:	linux-input@vger.kernel.org
++Description:	Set the joystick response curve. There are 4 points available with 1 being the lowest
++		point and 4 being the highest point.
++		- range 0-64 (corresponds to 0-100%)
++
++What:		/sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis_<x/y>_<left/right>/rc_point_index
++Date:		December 2023
++Contact:	linux-input@vger.kernel.org
++Description:	Descriptive labels for joystick response curve points.
++
++What:		/sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/btn_<label>/turbo
++Date:		December 2023
++Contact:	linux-input@vger.kernel.org
++Description:	Set the turbo mode of the button:
++		- 0 = no turbo: a press is registered on button press and a release on button release
++		- 1-16 = interval between presses if button held down in steps of 1000ms/16
++		These settings are not written to the MCU until `apply` is set.
++
++What:		/sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/vibration_intensity
++Date:		December 2023
++Contact:	linux-input@vger.kernel.org
++Description:	Set the vibration intensity for left and right haptics. Applies instantly.
++
++What:		/sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/vibration_intensity_index
++Date:		December 2023
++Contact:	linux-input@vger.kernel.org
++Description:	Descriptive labels for index points of vibration_intensity.
+\ No newline at end of file
 diff --git a/arch/Kconfig b/arch/Kconfig
 index a5af0edd3eb8..0731bc203aa9 100644
 --- a/arch/Kconfig
@@ -8773,13 +11254,2398 @@ index a5af0edd3eb8..0731bc203aa9 100644
  	depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS
  	help
  	  This value can be used to select the number of bits to use to
+diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile
+index 082a728eac60..7eefb548e33a 100644
+--- a/drivers/hid/Makefile
++++ b/drivers/hid/Makefile
+@@ -23,6 +23,8 @@ hid-logitech-$(CONFIG_LOGIWHEELS_FF)	+= hid-lg4ff.o
+ hid-wiimote-y		:= hid-wiimote-core.o hid-wiimote-modules.o
+ hid-wiimote-$(CONFIG_DEBUG_FS)	+= hid-wiimote-debug.o
+ 
++hid-asus-y		:= hid-asus-core.o hid-asus-rog.o
++
+ obj-$(CONFIG_HID_A4TECH)	+= hid-a4tech.o
+ obj-$(CONFIG_HID_ACCUTOUCH)	+= hid-accutouch.o
+ obj-$(CONFIG_HID_ALPS)		+= hid-alps.o
+diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus-core.c
+similarity index 89%
+rename from drivers/hid/hid-asus.c
+rename to drivers/hid/hid-asus-core.c
+index 78cdfb8b9a7a..026705c43ee1 100644
+--- a/drivers/hid/hid-asus.c
++++ b/drivers/hid/hid-asus-core.c
+@@ -20,9 +20,8 @@
+  *  Copyright (c) 2016 Frederik Wenigwieser <frederik.wenigwieser@gmail.com>
+  */
+ 
+-/*
+- */
+-
++#include <asm-generic/errno-base.h>
++#include <asm-generic/errno.h>
+ #include <linux/dmi.h>
+ #include <linux/hid.h>
+ #include <linux/module.h>
+@@ -32,6 +31,7 @@
+ #include <linux/power_supply.h>
+ #include <linux/leds.h>
+ 
++#include "hid-asus.h"
+ #include "hid-ids.h"
+ 
+ MODULE_AUTHOR("Yusuke Fujimaki <usk.fujimaki@gmail.com>");
+@@ -47,10 +47,6 @@ MODULE_DESCRIPTION("Asus HID Keyboard and TouchPad");
+ #define T100CHI_MOUSE_REPORT_ID 0x06
+ #define FEATURE_REPORT_ID 0x0d
+ #define INPUT_REPORT_ID 0x5d
+-#define FEATURE_KBD_REPORT_ID 0x5a
+-#define FEATURE_KBD_REPORT_SIZE 16
+-#define FEATURE_KBD_LED_REPORT_ID1 0x5d
+-#define FEATURE_KBD_LED_REPORT_ID2 0x5e
+ 
+ #define SUPPORT_KBD_BACKLIGHT BIT(0)
+ 
+@@ -71,20 +67,6 @@ MODULE_DESCRIPTION("Asus HID Keyboard and TouchPad");
+ #define	BATTERY_STAT_CHARGING	(1)
+ #define	BATTERY_STAT_FULL	(2)
+ 
+-#define QUIRK_FIX_NOTEBOOK_REPORT	BIT(0)
+-#define QUIRK_NO_INIT_REPORTS		BIT(1)
+-#define QUIRK_SKIP_INPUT_MAPPING	BIT(2)
+-#define QUIRK_IS_MULTITOUCH		BIT(3)
+-#define QUIRK_NO_CONSUMER_USAGES	BIT(4)
+-#define QUIRK_USE_KBD_BACKLIGHT		BIT(5)
+-#define QUIRK_T100_KEYBOARD		BIT(6)
+-#define QUIRK_T100CHI			BIT(7)
+-#define QUIRK_G752_KEYBOARD		BIT(8)
+-#define QUIRK_T90CHI			BIT(9)
+-#define QUIRK_MEDION_E1239T		BIT(10)
+-#define QUIRK_ROG_NKEY_KEYBOARD		BIT(11)
+-#define QUIRK_ROG_CLAYMORE_II_KEYBOARD BIT(12)
+-
+ #define I2C_KEYBOARD_QUIRKS			(QUIRK_FIX_NOTEBOOK_REPORT | \
+ 						 QUIRK_NO_INIT_REPORTS | \
+ 						 QUIRK_NO_CONSUMER_USAGES)
+@@ -113,22 +95,6 @@ struct asus_touchpad_info {
+ 	int report_size;
+ };
+ 
+-struct asus_drvdata {
+-	unsigned long quirks;
+-	struct hid_device *hdev;
+-	struct input_dev *input;
+-	struct input_dev *tp_kbd_input;
+-	struct asus_kbd_leds *kbd_backlight;
+-	const struct asus_touchpad_info *tp;
+-	bool enable_backlight;
+-	struct power_supply *battery;
+-	struct power_supply_desc battery_desc;
+-	int battery_capacity;
+-	int battery_stat;
+-	bool battery_in_query;
+-	unsigned long battery_next_query;
+-};
+-
+ static int asus_report_battery(struct asus_drvdata *, u8 *, int);
+ 
+ static const struct asus_touchpad_info asus_i2c_tp = {
+@@ -329,42 +295,36 @@ static int asus_raw_event(struct hid_device *hdev,
+ 	if (drvdata->battery && data[0] == BATTERY_REPORT_ID)
+ 		return asus_report_battery(drvdata, data, size);
+ 
++	// TODO: remove after debugging
++	// if (data[0] == 0x5a || data[0] == 0x5d || data[0] == 0x5e){
++	// 	 for (int i = 0; i < size; i++) {
++	// 		if (i == 0)
++	// 			printk(KERN_INFO "GOT: %02x,", data[i]);
++	// 		else
++	// 			printk(KERN_CONT "%02x,", data[i]);
++	// 	 }
++	// }
++
+ 	if (drvdata->tp && data[0] == INPUT_REPORT_ID)
+ 		return asus_report_input(drvdata, data, size);
+ 
+ 	if (drvdata->quirks & QUIRK_MEDION_E1239T)
+ 		return asus_e1239t_event(drvdata, data, size);
+ 
+-	if (drvdata->quirks & QUIRK_USE_KBD_BACKLIGHT) {
++	/*
++	 * Skip these report ID, the device emits a continuous stream associated
++	 * with the AURA mode it is in which looks like an 'echo'.
++	*/
++	if (report->id == FEATURE_KBD_LED_REPORT_ID1 || report->id == FEATURE_KBD_LED_REPORT_ID2)
++		return -1;
++	if (drvdata->quirks & QUIRK_ROG_NKEY_KEYBOARD) {
+ 		/*
+-		 * Skip these report ID, the device emits a continuous stream associated
+-		 * with the AURA mode it is in which looks like an 'echo'.
++		 * G713 and G733 send these codes on some keypresses, depending on
++		 * the key pressed it can trigger a shutdown event if not caught.
+ 		*/
+-		if (report->id == FEATURE_KBD_LED_REPORT_ID1 ||
+-				report->id == FEATURE_KBD_LED_REPORT_ID2) {
++		if(data[0] == 0x02 && data[1] == 0x30) {
+ 			return -1;
+-		/* Additional report filtering */
+-		} else if (report->id == FEATURE_KBD_REPORT_ID) {
+-			/*
+-			 * G14 and G15 send these codes on some keypresses with no
+-			 * discernable reason for doing so. We'll filter them out to avoid
+-			 * unmapped warning messages later.
+-			*/
+-			if (data[1] == 0xea || data[1] == 0xec || data[1] == 0x02 ||
+-					data[1] == 0x8a || data[1] == 0x9e) {
+-				return -1;
+-			}
+-		}
+-		if (drvdata->quirks & QUIRK_ROG_NKEY_KEYBOARD) {
+-			/*
+-			 * G713 and G733 send these codes on some keypresses, depending on
+-			 * the key pressed it can trigger a shutdown event if not caught.
+-			*/
+-			if(data[0] == 0x02 && data[1] == 0x30) {
+-				return -1;
+-			}
+ 		}
+-
+ 	}
+ 
+ 	if (drvdata->quirks & QUIRK_ROG_CLAYMORE_II_KEYBOARD) {
+@@ -381,7 +341,7 @@ static int asus_raw_event(struct hid_device *hdev,
+ 	return 0;
+ }
+ 
+-static int asus_kbd_set_report(struct hid_device *hdev, const u8 *buf, size_t buf_size)
++int asus_kbd_set_report(struct hid_device *hdev, const u8 *buf, size_t buf_size)
+ {
+ 	unsigned char *dmabuf;
+ 	int ret;
+@@ -402,9 +362,16 @@ static int asus_kbd_set_report(struct hid_device *hdev, const u8 *buf, size_t bu
+ 	return ret;
+ }
+ 
+-static int asus_kbd_init(struct hid_device *hdev)
++int asus_kbd_get_report(struct hid_device *hdev, u8 *out_buf, size_t out_buf_size)
++{
++	return hid_hw_raw_request(hdev, FEATURE_KBD_REPORT_ID, out_buf,
++				 out_buf_size, HID_FEATURE_REPORT,
++				 HID_REQ_GET_REPORT);
++}
++
++static int asus_kbd_init(struct hid_device *hdev, u8 report_id)
+ {
+-	const u8 buf[] = { FEATURE_KBD_REPORT_ID, 0x41, 0x53, 0x55, 0x53, 0x20, 0x54,
++	const u8 buf[] = { report_id, 0x41, 0x53, 0x55, 0x53, 0x20, 0x54,
+ 		     0x65, 0x63, 0x68, 0x2e, 0x49, 0x6e, 0x63, 0x2e, 0x00 };
+ 	int ret;
+ 
+@@ -416,9 +383,10 @@ static int asus_kbd_init(struct hid_device *hdev)
+ }
+ 
+ static int asus_kbd_get_functions(struct hid_device *hdev,
+-				  unsigned char *kbd_func)
++				  unsigned char *kbd_func,
++				  u8 report_id)
+ {
+-	const u8 buf[] = { FEATURE_KBD_REPORT_ID, 0x05, 0x20, 0x31, 0x00, 0x08 };
++	const u8 buf[] = { report_id, 0x05, 0x20, 0x31, 0x00, 0x08 };
+ 	u8 *readbuf;
+ 	int ret;
+ 
+@@ -447,51 +415,6 @@ static int asus_kbd_get_functions(struct hid_device *hdev,
+ 	return ret;
+ }
+ 
+-static int rog_nkey_led_init(struct hid_device *hdev)
+-{
+-	const u8 buf_init_start[] = { FEATURE_KBD_LED_REPORT_ID1, 0xB9 };
+-	u8 buf_init2[] = { FEATURE_KBD_LED_REPORT_ID1, 0x41, 0x53, 0x55, 0x53, 0x20,
+-				0x54, 0x65, 0x63, 0x68, 0x2e, 0x49, 0x6e, 0x63, 0x2e, 0x00 };
+-	u8 buf_init3[] = { FEATURE_KBD_LED_REPORT_ID1,
+-						0x05, 0x20, 0x31, 0x00, 0x08 };
+-	int ret;
+-
+-	hid_info(hdev, "Asus initialise N-KEY Device");
+-	/* The first message is an init start */
+-	ret = asus_kbd_set_report(hdev, buf_init_start, sizeof(buf_init_start));
+-	if (ret < 0) {
+-		hid_warn(hdev, "Asus failed to send init start command: %d\n", ret);
+-		return ret;
+-	}
+-	/* Followed by a string */
+-	ret = asus_kbd_set_report(hdev, buf_init2, sizeof(buf_init2));
+-	if (ret < 0) {
+-		hid_warn(hdev, "Asus failed to send init command 1.0: %d\n", ret);
+-		return ret;
+-	}
+-	/* Followed by a string */
+-	ret = asus_kbd_set_report(hdev, buf_init3, sizeof(buf_init3));
+-	if (ret < 0) {
+-		hid_warn(hdev, "Asus failed to send init command 1.1: %d\n", ret);
+-		return ret;
+-	}
+-
+-	/* begin second report ID with same data */
+-	buf_init2[0] = FEATURE_KBD_LED_REPORT_ID2;
+-	buf_init3[0] = FEATURE_KBD_LED_REPORT_ID2;
+-
+-	ret = asus_kbd_set_report(hdev, buf_init2, sizeof(buf_init2));
+-	if (ret < 0) {
+-		hid_warn(hdev, "Asus failed to send init command 2.0: %d\n", ret);
+-		return ret;
+-	}
+-	ret = asus_kbd_set_report(hdev, buf_init3, sizeof(buf_init3));
+-	if (ret < 0)
+-		hid_warn(hdev, "Asus failed to send init command 2.1: %d\n", ret);
+-
+-	return ret;
+-}
+-
+ static void asus_schedule_work(struct asus_kbd_leds *led)
+ {
+ 	unsigned long flags;
+@@ -574,17 +497,27 @@ static int asus_kbd_register_leds(struct hid_device *hdev)
+ 	int ret;
+ 
+ 	if (drvdata->quirks & QUIRK_ROG_NKEY_KEYBOARD) {
+-		ret = rog_nkey_led_init(hdev);
++		/* Initialize keyboard */
++		ret = asus_kbd_init(hdev, FEATURE_KBD_REPORT_ID);
++		if (ret < 0)
++			return ret;
++
++		/* The LED endpoint is initialised in two HID */
++		ret = asus_kbd_init(hdev, FEATURE_KBD_LED_REPORT_ID1);
++		if (ret < 0)
++			return ret;
++
++		ret = asus_kbd_init(hdev, FEATURE_KBD_LED_REPORT_ID2);
+ 		if (ret < 0)
+ 			return ret;
+ 	} else {
+ 		/* Initialize keyboard */
+-		ret = asus_kbd_init(hdev);
++		ret = asus_kbd_init(hdev, FEATURE_KBD_REPORT_ID);
+ 		if (ret < 0)
+ 			return ret;
+ 
+ 		/* Get keyboard functions */
+-		ret = asus_kbd_get_functions(hdev, &kbd_func);
++		ret = asus_kbd_get_functions(hdev, &kbd_func, FEATURE_KBD_REPORT_ID);
+ 		if (ret < 0)
+ 			return ret;
+ 
+@@ -896,7 +829,11 @@ static int asus_input_mapping(struct hid_device *hdev,
+ 		case 0xb2: asus_map_key_clear(KEY_PROG2);	break; /* Fn+Left previous aura */
+ 		case 0xb3: asus_map_key_clear(KEY_PROG3);	break; /* Fn+Left next aura */
+ 		case 0x6a: asus_map_key_clear(KEY_F13);		break; /* Screenpad toggle */
+-		case 0x4b: asus_map_key_clear(KEY_F14);		break; /* Arrows/Pg-Up/Dn toggle */
++		case 0x4b: asus_map_key_clear(KEY_F14);		break; /* Arrows/Pg-Up/Dn toggle, Ally M1 */
++		case 0xa5: asus_map_key_clear(KEY_F15);		break; /* ROG Ally M2 */
++		case 0xa6: asus_map_key_clear(KEY_F16);		break; /* ROG Ally QAM button */
++		case 0xa7: asus_map_key_clear(KEY_F17);		break; /* ROG Ally ROG long-press */
++		case 0xa8: asus_map_key_clear(KEY_F18);		break; /* ROG Ally ROG long-press-release */
+ 
+ 
+ 		default:
+@@ -1109,6 +1046,10 @@ static int asus_probe(struct hid_device *hdev, const struct hid_device_id *id)
+ 		}
+ 	}
+ 
++	/* all ROG devices have this HID interface but we will focus on Ally for now */
++	if (drvdata->quirks & QUIRK_ROG_NKEY_KEYBOARD && hid_is_usb(hdev))
++		rog_ally.probe(hdev, &rog_ally);
++
+ 	ret = hid_parse(hdev);
+ 	if (ret) {
+ 		hid_err(hdev, "Asus hid parse failed: %d\n", ret);
+@@ -1158,6 +1099,8 @@ static void asus_remove(struct hid_device *hdev)
+ 		cancel_work_sync(&drvdata->kbd_backlight->work);
+ 	}
+ 
++	rog_ally.remove(hdev, &rog_ally);
++
+ 	hid_hw_stop(hdev);
+ }
+ 
+@@ -1250,6 +1193,19 @@ static __u8 *asus_report_fixup(struct hid_device *hdev, __u8 *rdesc,
+ 		rdesc[205] = 0x01;
+ 	}
+ 
++	/* match many more n-key devices */
++	if (drvdata->quirks & QUIRK_ROG_NKEY_KEYBOARD) {
++		for (int i = 0; i + 15 < *rsize; i++) {
++			/* count for the 0x5a report sits 14 bytes on; the bound keeps rdesc[i + 15] in range */
++			if (rdesc[i] == 0x85 && rdesc[i + 1] == 0x5a &&
++			    rdesc[i + 14] == 0x95 && rdesc[i + 15] == 0x05) {
++				hid_info(hdev, "Fixing up Asus N-Key report descriptor\n");
++				rdesc[i + 15] = 0x01;
++				break;
++			}
++		}
++	}
++
+ 	return rdesc;
+ }
+ 
+@@ -1276,6 +1232,9 @@ static const struct hid_device_id asus_devices[] = {
+ 	{ HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK,
+ 	    USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD3),
+ 	  QUIRK_USE_KBD_BACKLIGHT | QUIRK_ROG_NKEY_KEYBOARD },
++	{ HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK,
++	    USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY),
++	  QUIRK_USE_KBD_BACKLIGHT | QUIRK_ROG_NKEY_KEYBOARD | QUIRK_ROG_ALLY_XPAD },
+ 	{ HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK,
+ 	    USB_DEVICE_ID_ASUSTEK_ROG_CLAYMORE_II_KEYBOARD),
+ 	  QUIRK_ROG_CLAYMORE_II_KEYBOARD },
+diff --git a/drivers/hid/hid-asus-rog.c b/drivers/hid/hid-asus-rog.c
+new file mode 100644
+index 000000000000..584aefd0915a
+--- /dev/null
++++ b/drivers/hid/hid-asus-rog.c
+@@ -0,0 +1,1468 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++/*
++ *  HID driver for Asus ROG laptops and Ally
++ *
++ *  Copyright (c) 2023 Luke Jones <luke@ljones.dev>
++ */
++
++#include <linux/hid.h>
++#include <linux/types.h>
++#include <linux/usb.h>
++
++#include "hid-asus.h"
++#include "hid-asus-rog.h"
++
++/* ROG Ally has many settings related to the gamepad, all using the same n-key endpoint */
++struct asus_rog_ally {
++	enum xpad_mode mode; /* current mode; key_mapping and turbo_btns are indexed with mode - 1 */
++	/*
++	 * deadzones index: [joysticks/triggers][left(2 bytes), right(2 bytes)]
++	 * joysticks: 2 bytes: inner, outer
++	 * triggers: 2 bytes: lower, upper
++	 * min/max: 0-64
++	 */
++	u8 deadzones[xpad_mode_mouse][2][4];
++	/*
++	 * vibration_intensity index: left, right
++	 * max: 64
++	 */
++	u8 vibration_intensity[xpad_mode_mouse][2];
++	/*
++	 * response_curve index: [joysticks][2 byte stepping per point]
++	 * - 4 points of 2 bytes each
++	 * - byte 0 of pair = stick move %
++	 * - byte 1 of pair = stick response %
++	 * - min/max: 1-63
++	 */
++	bool supports_response_curves;
++	u8 response_curve[xpad_mode_mouse][2][8];
++	/*
++	 * anti_deadzones: left = byte 0, right = byte 1
++	 */
++	bool supports_anti_deadzones;
++	u8 anti_deadzones[xpad_mode_mouse][2];
++	/*
++	 * index: [mode - 1][pair - 1][b1, b1 secondary, b2, b2 secondary]; b2 starts at MAPPING_BLOCK_LEN / 2
++	*/
++	u8 key_mapping[xpad_mode_mouse][btn_pair_lt_rt][MAPPING_BLOCK_LEN];
++	/*
++	 * index: [mode - 1][__gamepad_turbo_index(pair, side)]; stored values are 0 (no turbo) to 16
++	*/
++	u8 turbo_btns[xpad_mode_mouse][TURBO_BLOCK_LEN];
++	/* joystick (js) and trigger (tr) calibration values; the array layout is not visible
++	 * in the surrounding code - presumably [left, right][values], TODO confirm */
++	u32 js_calibrations[2][6];
++	u32 tr_calibrations[2][2];
++};
++
++static struct asus_rog_ally *__rog_ally_data(struct device *raw_dev)
++{
++	struct asus_drvdata *drvdata = hid_get_drvdata(to_hid_device(raw_dev));
++	return drvdata->rog_ally_data; /* Ally state stored in the HID driver data */
++}
++
++#define STR_TO_CODE_IF(_idx, _code, _label) \
++	if (!strcmp(buf, _label))           \
++		out[_idx] = _code;
++
++#define STR_TO_CODE_ELIF(_idx, _code, _label) else if (!strcmp(buf, _label)) out[_idx] = _code;
++
++/* Encode the key/function label in buf into out; 0 on success, -EINVAL/-ENOMEM on error (out kept) */
++static int __string_to_key_code(const char *buf, u8 *out, int out_len)
++{
++	u8 *save_buf;
++
++	if (out_len != BTN_CODE_LEN)
++		return -EINVAL;
++
++	save_buf = kzalloc(out_len, GFP_KERNEL);
++	if (!save_buf)
++		return -ENOMEM;
++	memcpy(save_buf, out, out_len);
++	memset(out, 0, out_len); // always clear before adjusting
++
++	// " " or "\n" clears the mapping: out stays all-zero
++	if (!strcmp(buf, " ") || !strcmp(buf, "\n"))
++		goto success;
++
++	// group 0x01: PAD_* labels, code stored in byte 1
++	out[0] = 0x01;
++	STR_TO_CODE_IF(1, 0x01, PAD_A)
++	STR_TO_CODE_ELIF(1, 0x02, PAD_B)
++	STR_TO_CODE_ELIF(1, 0x03, PAD_X)
++	STR_TO_CODE_ELIF(1, 0x04, PAD_Y)
++	STR_TO_CODE_ELIF(1, 0x05, PAD_LB)
++	STR_TO_CODE_ELIF(1, 0x06, PAD_RB)
++	STR_TO_CODE_ELIF(1, 0x07, PAD_LS)
++	STR_TO_CODE_ELIF(1, 0x08, PAD_RS)
++	STR_TO_CODE_ELIF(1, 0x09, PAD_DPAD_UP)
++	STR_TO_CODE_ELIF(1, 0x0a, PAD_DPAD_DOWN)
++	STR_TO_CODE_ELIF(1, 0x0b, PAD_DPAD_LEFT)
++	STR_TO_CODE_ELIF(1, 0x0c, PAD_DPAD_RIGHT)
++	STR_TO_CODE_ELIF(1, 0x11, PAD_VIEW)
++	STR_TO_CODE_ELIF(1, 0x12, PAD_MENU)
++	STR_TO_CODE_ELIF(1, 0x13, PAD_XBOX)
++	if (out[1])
++		goto success;
++
++	// group 0x02: KB_* and NUMPAD_* labels, code stored in byte 2
++	out[0] = 0x02;
++	STR_TO_CODE_IF(2, 0x8f, KB_M1)
++	STR_TO_CODE_ELIF(2, 0x8e, KB_M2)
++
++	STR_TO_CODE_ELIF(2, 0x76, KB_ESC)
++	STR_TO_CODE_ELIF(2, 0x50, KB_F1)
++	STR_TO_CODE_ELIF(2, 0x60, KB_F2)
++	STR_TO_CODE_ELIF(2, 0x40, KB_F3)
++	STR_TO_CODE_ELIF(2, 0x0c, KB_F4)
++	STR_TO_CODE_ELIF(2, 0x03, KB_F5)
++	STR_TO_CODE_ELIF(2, 0x0b, KB_F6)
++	STR_TO_CODE_ELIF(2, 0x80, KB_F7)
++	STR_TO_CODE_ELIF(2, 0x0a, KB_F8)
++	STR_TO_CODE_ELIF(2, 0x01, KB_F9)
++	STR_TO_CODE_ELIF(2, 0x09, KB_F10)
++	STR_TO_CODE_ELIF(2, 0x78, KB_F11)
++	STR_TO_CODE_ELIF(2, 0x07, KB_F12)
++	STR_TO_CODE_ELIF(2, 0x10, KB_F14)
++	STR_TO_CODE_ELIF(2, 0x18, KB_F15)
++
++	STR_TO_CODE_ELIF(2, 0x0e, KB_BACKTICK)
++	STR_TO_CODE_ELIF(2, 0x16, KB_1)
++	STR_TO_CODE_ELIF(2, 0x1e, KB_2)
++	STR_TO_CODE_ELIF(2, 0x26, KB_3)
++	STR_TO_CODE_ELIF(2, 0x25, KB_4)
++	STR_TO_CODE_ELIF(2, 0x2e, KB_5)
++	STR_TO_CODE_ELIF(2, 0x36, KB_6)
++	STR_TO_CODE_ELIF(2, 0x3d, KB_7)
++	STR_TO_CODE_ELIF(2, 0x3e, KB_8)
++	STR_TO_CODE_ELIF(2, 0x46, KB_9)
++	STR_TO_CODE_ELIF(2, 0x45, KB_0)
++	STR_TO_CODE_ELIF(2, 0x4e, KB_HYPHEN)
++	STR_TO_CODE_ELIF(2, 0x55, KB_EQUALS)
++	STR_TO_CODE_ELIF(2, 0x66, KB_BACKSPACE)
++
++	STR_TO_CODE_ELIF(2, 0x0d, KB_TAB)
++	STR_TO_CODE_ELIF(2, 0x15, KB_Q)
++	STR_TO_CODE_ELIF(2, 0x1d, KB_W)
++	STR_TO_CODE_ELIF(2, 0x24, KB_E)
++	STR_TO_CODE_ELIF(2, 0x2d, KB_R)
++	STR_TO_CODE_ELIF(2, 0x2c, KB_T) /* was 0x2d, which is KB_R; the letter codes follow PS/2 set 2 */
++	STR_TO_CODE_ELIF(2, 0x35, KB_Y)
++	STR_TO_CODE_ELIF(2, 0x3c, KB_U)
++	STR_TO_CODE_ELIF(2, 0x43, KB_I)
++	STR_TO_CODE_ELIF(2, 0x44, KB_O)
++	STR_TO_CODE_ELIF(2, 0x4d, KB_P)
++	STR_TO_CODE_ELIF(2, 0x54, KB_LBRACKET)
++	STR_TO_CODE_ELIF(2, 0x5b, KB_RBRACKET)
++	STR_TO_CODE_ELIF(2, 0x5d, KB_BACKSLASH)
++
++	STR_TO_CODE_ELIF(2, 0x58, KB_CAPS)
++	STR_TO_CODE_ELIF(2, 0x1c, KB_A)
++	STR_TO_CODE_ELIF(2, 0x1b, KB_S)
++	STR_TO_CODE_ELIF(2, 0x23, KB_D)
++	STR_TO_CODE_ELIF(2, 0x2b, KB_F)
++	STR_TO_CODE_ELIF(2, 0x34, KB_G)
++	STR_TO_CODE_ELIF(2, 0x33, KB_H)
++	STR_TO_CODE_ELIF(2, 0x3b, KB_J)
++	STR_TO_CODE_ELIF(2, 0x42, KB_K)
++	STR_TO_CODE_ELIF(2, 0x4b, KB_L)
++	STR_TO_CODE_ELIF(2, 0x4c, KB_SEMI)
++	STR_TO_CODE_ELIF(2, 0x52, KB_QUOTE)
++	STR_TO_CODE_ELIF(2, 0x5a, KB_RET)
++
++	STR_TO_CODE_ELIF(2, 0x88, KB_LSHIFT)
++	STR_TO_CODE_ELIF(2, 0x1a, KB_Z)
++	STR_TO_CODE_ELIF(2, 0x22, KB_X)
++	STR_TO_CODE_ELIF(2, 0x21, KB_C)
++	STR_TO_CODE_ELIF(2, 0x2a, KB_V)
++	STR_TO_CODE_ELIF(2, 0x32, KB_B)
++	STR_TO_CODE_ELIF(2, 0x31, KB_N)
++	STR_TO_CODE_ELIF(2, 0x3a, KB_M)
++	STR_TO_CODE_ELIF(2, 0x41, KB_COMMA)
++	STR_TO_CODE_ELIF(2, 0x49, KB_PERIOD)
++	STR_TO_CODE_ELIF(2, 0x4a, KB_FWDSLASH)
++	STR_TO_CODE_ELIF(2, 0x89, KB_RSHIFT)
++
++	STR_TO_CODE_ELIF(2, 0x8c, KB_LCTL)
++	STR_TO_CODE_ELIF(2, 0x82, KB_META)
++	STR_TO_CODE_ELIF(2, 0xba, KB_LALT)
++	STR_TO_CODE_ELIF(2, 0x29, KB_SPACE)
++	STR_TO_CODE_ELIF(2, 0x8b, KB_RALT)
++	STR_TO_CODE_ELIF(2, 0x84, KB_MENU)
++	STR_TO_CODE_ELIF(2, 0x8d, KB_RCTL)
++
++	STR_TO_CODE_ELIF(2, 0xc3, KB_PRNTSCN)
++	STR_TO_CODE_ELIF(2, 0x7e, KB_SCRLCK)
++	STR_TO_CODE_ELIF(2, 0x91, KB_PAUSE)
++	STR_TO_CODE_ELIF(2, 0xc2, KB_INS)
++	STR_TO_CODE_ELIF(2, 0x94, KB_HOME)
++	STR_TO_CODE_ELIF(2, 0x96, KB_PGUP)
++	STR_TO_CODE_ELIF(2, 0xc0, KB_DEL)
++	STR_TO_CODE_ELIF(2, 0x95, KB_END)
++	STR_TO_CODE_ELIF(2, 0x97, KB_PGDWN)
++
++	STR_TO_CODE_ELIF(2, 0x99, KB_UP_ARROW)
++	STR_TO_CODE_ELIF(2, 0x98, KB_DOWN_ARROW)
++	STR_TO_CODE_ELIF(2, 0x91, KB_LEFT_ARROW) /* NOTE(review): same code as KB_PAUSE; 0x9a would fit 0x98-0x9b - confirm */
++	STR_TO_CODE_ELIF(2, 0x9b, KB_RIGHT_ARROW)
++
++	STR_TO_CODE_ELIF(2, 0x77, NUMPAD_LOCK)
++	STR_TO_CODE_ELIF(2, 0x90, NUMPAD_FWDSLASH)
++	STR_TO_CODE_ELIF(2, 0x7c, NUMPAD_ASTERISK)
++	STR_TO_CODE_ELIF(2, 0x7b, NUMPAD_HYPHEN)
++	STR_TO_CODE_ELIF(2, 0x70, NUMPAD_0)
++	STR_TO_CODE_ELIF(2, 0x69, NUMPAD_1)
++	STR_TO_CODE_ELIF(2, 0x72, NUMPAD_2)
++	STR_TO_CODE_ELIF(2, 0x7a, NUMPAD_3)
++	STR_TO_CODE_ELIF(2, 0x6b, NUMPAD_4)
++	STR_TO_CODE_ELIF(2, 0x73, NUMPAD_5)
++	STR_TO_CODE_ELIF(2, 0x74, NUMPAD_6)
++	STR_TO_CODE_ELIF(2, 0x6c, NUMPAD_7)
++	STR_TO_CODE_ELIF(2, 0x75, NUMPAD_8)
++	STR_TO_CODE_ELIF(2, 0x7d, NUMPAD_9)
++	STR_TO_CODE_ELIF(2, 0x79, NUMPAD_PLUS)
++	STR_TO_CODE_ELIF(2, 0x81, NUMPAD_ENTER)
++	STR_TO_CODE_ELIF(2, 0x71, NUMPAD_PERIOD)
++	if (out[2])
++		goto success;
++
++	out[0] = 0x03; // group 0x03: RAT_* (click/wheel) labels, code stored in byte 4
++	STR_TO_CODE_IF(4, 0x01, RAT_LCLICK)
++	STR_TO_CODE_ELIF(4, 0x02, RAT_RCLICK)
++	STR_TO_CODE_ELIF(4, 0x03, RAT_MCLICK)
++	STR_TO_CODE_ELIF(4, 0x04, RAT_WHEEL_UP)
++	STR_TO_CODE_ELIF(4, 0x05, RAT_WHEEL_DOWN)
++	if (out[4] != 0)
++		goto success;
++
++	out[0] = 0x05; // group 0x05: MEDIA_* labels, code stored in byte 3
++	STR_TO_CODE_IF(3, 0x16, MEDIA_SCREENSHOT)
++	STR_TO_CODE_ELIF(3, 0x19, MEDIA_SHOW_KEYBOARD)
++	STR_TO_CODE_ELIF(3, 0x1c, MEDIA_SHOW_DESKTOP)
++	STR_TO_CODE_ELIF(3, 0x1e, MEDIA_START_RECORDING)
++	STR_TO_CODE_ELIF(3, 0x01, MEDIA_MIC_OFF)
++	STR_TO_CODE_ELIF(3, 0x02, MEDIA_VOL_DOWN)
++	STR_TO_CODE_ELIF(3, 0x03, MEDIA_VOL_UP)
++	if (out[3])
++		goto success;
++
++	// no label matched: put back the bytes that were in out on entry
++	memcpy(out, save_buf, out_len);
++	kfree(save_buf);
++	return -EINVAL;
++
++success:
++	kfree(save_buf);
++	return 0;
++}
++
++#define CODE_TO_STR_IF(_idx, _code, _label) \
++	if (btn_block[_idx] == _code)       \
++		return _label;
++
++/* Return the label of the code cached for (pair, side, secondary) in the current mode, or "" */
++static const char *__btn_map_to_string(struct device *raw_dev, enum btn_pair pair,
++				       enum btn_pair_side side, bool secondary)
++{
++	struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);
++	int offs = side ? MAPPING_BLOCK_LEN / 2 : 0;
++	u8 *btn_block;
++
++	// TODO: this little block is common (same slot lookup as __gamepad_mapping_store)
++	offs = secondary ? offs + BTN_CODE_LEN : offs;
++	btn_block = rog_ally->key_mapping[rog_ally->mode - 1][pair - 1] + offs;
++
++	if (btn_block[0] == 0x01) {
++		CODE_TO_STR_IF(1, 0x01, PAD_A)
++		CODE_TO_STR_IF(1, 0x02, PAD_B)
++		CODE_TO_STR_IF(1, 0x03, PAD_X)
++		CODE_TO_STR_IF(1, 0x04, PAD_Y)
++		CODE_TO_STR_IF(1, 0x05, PAD_LB)
++		CODE_TO_STR_IF(1, 0x06, PAD_RB)
++		CODE_TO_STR_IF(1, 0x07, PAD_LS)
++		CODE_TO_STR_IF(1, 0x08, PAD_RS)
++		CODE_TO_STR_IF(1, 0x09, PAD_DPAD_UP)
++		CODE_TO_STR_IF(1, 0x0a, PAD_DPAD_DOWN)
++		CODE_TO_STR_IF(1, 0x0b, PAD_DPAD_LEFT)
++		CODE_TO_STR_IF(1, 0x0c, PAD_DPAD_RIGHT)
++		CODE_TO_STR_IF(1, 0x11, PAD_VIEW)
++		CODE_TO_STR_IF(1, 0x12, PAD_MENU)
++		CODE_TO_STR_IF(1, 0x13, PAD_XBOX)
++	}
++
++	if (btn_block[0] == 0x02) {
++		CODE_TO_STR_IF(2, 0x8f, KB_M1)
++		CODE_TO_STR_IF(2, 0x8e, KB_M2)
++		CODE_TO_STR_IF(2, 0x76, KB_ESC)
++		CODE_TO_STR_IF(2, 0x50, KB_F1)
++		CODE_TO_STR_IF(2, 0x60, KB_F2)
++		CODE_TO_STR_IF(2, 0x40, KB_F3)
++		CODE_TO_STR_IF(2, 0x0c, KB_F4)
++		CODE_TO_STR_IF(2, 0x03, KB_F5)
++		CODE_TO_STR_IF(2, 0x0b, KB_F6)
++		CODE_TO_STR_IF(2, 0x80, KB_F7)
++		CODE_TO_STR_IF(2, 0x0a, KB_F8)
++		CODE_TO_STR_IF(2, 0x01, KB_F9)
++		CODE_TO_STR_IF(2, 0x09, KB_F10)
++		CODE_TO_STR_IF(2, 0x78, KB_F11)
++		CODE_TO_STR_IF(2, 0x07, KB_F12)
++		CODE_TO_STR_IF(2, 0x10, KB_F14)
++		CODE_TO_STR_IF(2, 0x18, KB_F15)
++
++		CODE_TO_STR_IF(2, 0x0e, KB_BACKTICK)
++		CODE_TO_STR_IF(2, 0x16, KB_1)
++		CODE_TO_STR_IF(2, 0x1e, KB_2)
++		CODE_TO_STR_IF(2, 0x26, KB_3)
++		CODE_TO_STR_IF(2, 0x25, KB_4)
++		CODE_TO_STR_IF(2, 0x2e, KB_5)
++		CODE_TO_STR_IF(2, 0x36, KB_6)
++		CODE_TO_STR_IF(2, 0x3d, KB_7)
++		CODE_TO_STR_IF(2, 0x3e, KB_8)
++		CODE_TO_STR_IF(2, 0x46, KB_9)
++		CODE_TO_STR_IF(2, 0x45, KB_0)
++		CODE_TO_STR_IF(2, 0x4e, KB_HYPHEN)
++		CODE_TO_STR_IF(2, 0x55, KB_EQUALS)
++		CODE_TO_STR_IF(2, 0x66, KB_BACKSPACE)
++
++		CODE_TO_STR_IF(2, 0x0d, KB_TAB)
++		CODE_TO_STR_IF(2, 0x15, KB_Q)
++		CODE_TO_STR_IF(2, 0x1d, KB_W)
++		CODE_TO_STR_IF(2, 0x24, KB_E)
++		CODE_TO_STR_IF(2, 0x2d, KB_R)
++		CODE_TO_STR_IF(2, 0x2c, KB_T) /* was 0x2d: unreachable behind KB_R; letter codes follow PS/2 set 2 */
++		CODE_TO_STR_IF(2, 0x35, KB_Y)
++		CODE_TO_STR_IF(2, 0x3c, KB_U)
++		CODE_TO_STR_IF(2, 0x43, KB_I)
++		CODE_TO_STR_IF(2, 0x44, KB_O)
++		CODE_TO_STR_IF(2, 0x4d, KB_P)
++		CODE_TO_STR_IF(2, 0x54, KB_LBRACKET)
++		CODE_TO_STR_IF(2, 0x5b, KB_RBRACKET)
++		CODE_TO_STR_IF(2, 0x5d, KB_BACKSLASH)
++
++		CODE_TO_STR_IF(2, 0x58, KB_CAPS)
++		CODE_TO_STR_IF(2, 0x1c, KB_A)
++		CODE_TO_STR_IF(2, 0x1b, KB_S)
++		CODE_TO_STR_IF(2, 0x23, KB_D)
++		CODE_TO_STR_IF(2, 0x2b, KB_F)
++		CODE_TO_STR_IF(2, 0x34, KB_G)
++		CODE_TO_STR_IF(2, 0x33, KB_H)
++		CODE_TO_STR_IF(2, 0x3b, KB_J)
++		CODE_TO_STR_IF(2, 0x42, KB_K)
++		CODE_TO_STR_IF(2, 0x4b, KB_L)
++		CODE_TO_STR_IF(2, 0x4c, KB_SEMI)
++		CODE_TO_STR_IF(2, 0x52, KB_QUOTE)
++		CODE_TO_STR_IF(2, 0x5a, KB_RET)
++
++		CODE_TO_STR_IF(2, 0x88, KB_LSHIFT)
++		CODE_TO_STR_IF(2, 0x1a, KB_Z)
++		CODE_TO_STR_IF(2, 0x22, KB_X)
++		CODE_TO_STR_IF(2, 0x21, KB_C)
++		CODE_TO_STR_IF(2, 0x2a, KB_V)
++		CODE_TO_STR_IF(2, 0x32, KB_B)
++		CODE_TO_STR_IF(2, 0x31, KB_N)
++		CODE_TO_STR_IF(2, 0x3a, KB_M)
++		CODE_TO_STR_IF(2, 0x41, KB_COMMA)
++		CODE_TO_STR_IF(2, 0x49, KB_PERIOD)
++		CODE_TO_STR_IF(2, 0x4a, KB_FWDSLASH)
++		CODE_TO_STR_IF(2, 0x89, KB_RSHIFT)
++
++		CODE_TO_STR_IF(2, 0x8c, KB_LCTL)
++		CODE_TO_STR_IF(2, 0x82, KB_META)
++		CODE_TO_STR_IF(2, 0xba, KB_LALT)
++		CODE_TO_STR_IF(2, 0x29, KB_SPACE)
++		CODE_TO_STR_IF(2, 0x8b, KB_RALT)
++		CODE_TO_STR_IF(2, 0x84, KB_MENU)
++		CODE_TO_STR_IF(2, 0x8d, KB_RCTL)
++
++		CODE_TO_STR_IF(2, 0xc3, KB_PRNTSCN)
++		CODE_TO_STR_IF(2, 0x7e, KB_SCRLCK)
++		CODE_TO_STR_IF(2, 0x91, KB_PAUSE)
++		CODE_TO_STR_IF(2, 0xc2, KB_INS)
++		CODE_TO_STR_IF(2, 0x94, KB_HOME)
++		CODE_TO_STR_IF(2, 0x96, KB_PGUP)
++		CODE_TO_STR_IF(2, 0xc0, KB_DEL)
++		CODE_TO_STR_IF(2, 0x95, KB_END)
++		CODE_TO_STR_IF(2, 0x97, KB_PGDWN)
++
++		CODE_TO_STR_IF(2, 0x99, KB_UP_ARROW)
++		CODE_TO_STR_IF(2, 0x98, KB_DOWN_ARROW)
++		CODE_TO_STR_IF(2, 0x91, KB_LEFT_ARROW) /* NOTE(review): unreachable, KB_PAUSE also uses 0x91 - confirm code */
++		CODE_TO_STR_IF(2, 0x9b, KB_RIGHT_ARROW)
++
++		CODE_TO_STR_IF(2, 0x77, NUMPAD_LOCK)
++		CODE_TO_STR_IF(2, 0x90, NUMPAD_FWDSLASH)
++		CODE_TO_STR_IF(2, 0x7c, NUMPAD_ASTERISK)
++		CODE_TO_STR_IF(2, 0x7b, NUMPAD_HYPHEN)
++		CODE_TO_STR_IF(2, 0x70, NUMPAD_0)
++		CODE_TO_STR_IF(2, 0x69, NUMPAD_1)
++		CODE_TO_STR_IF(2, 0x72, NUMPAD_2)
++		CODE_TO_STR_IF(2, 0x7a, NUMPAD_3)
++		CODE_TO_STR_IF(2, 0x6b, NUMPAD_4)
++		CODE_TO_STR_IF(2, 0x73, NUMPAD_5)
++		CODE_TO_STR_IF(2, 0x74, NUMPAD_6)
++		CODE_TO_STR_IF(2, 0x6c, NUMPAD_7)
++		CODE_TO_STR_IF(2, 0x75, NUMPAD_8)
++		CODE_TO_STR_IF(2, 0x7d, NUMPAD_9)
++		CODE_TO_STR_IF(2, 0x79, NUMPAD_PLUS)
++		CODE_TO_STR_IF(2, 0x81, NUMPAD_ENTER)
++		CODE_TO_STR_IF(2, 0x71, NUMPAD_PERIOD)
++	}
++
++	if (btn_block[0] == 0x03) {
++		CODE_TO_STR_IF(4, 0x01, RAT_LCLICK)
++		CODE_TO_STR_IF(4, 0x02, RAT_RCLICK)
++		CODE_TO_STR_IF(4, 0x03, RAT_MCLICK)
++		CODE_TO_STR_IF(4, 0x04, RAT_WHEEL_UP)
++		CODE_TO_STR_IF(4, 0x05, RAT_WHEEL_DOWN)
++	}
++
++	if (btn_block[0] == 0x05) {
++		CODE_TO_STR_IF(3, 0x16, MEDIA_SCREENSHOT)
++		CODE_TO_STR_IF(3, 0x19, MEDIA_SHOW_KEYBOARD)
++		CODE_TO_STR_IF(3, 0x1c, MEDIA_SHOW_DESKTOP)
++		CODE_TO_STR_IF(3, 0x1e, MEDIA_START_RECORDING)
++		CODE_TO_STR_IF(3, 0x01, MEDIA_MIC_OFF)
++		CODE_TO_STR_IF(3, 0x02, MEDIA_VOL_DOWN)
++		CODE_TO_STR_IF(3, 0x03, MEDIA_VOL_UP)
++	}
++
++	return ""; // no group/code pair matched
++}
++
++/* ASUS ROG Ally device specific attributes */
++
++/* Ask the MCU whether it is ready (up to 3 tries); call before any attempt to set device functions */
++static int __gamepad_check_ready(struct hid_device *hdev)
++{
++	u8 *hidbuf;
++	int ret, count;
++
++	hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL);
++	if (!hidbuf)
++		return -ENOMEM;
++
++	for (count = 0; count < 3; count++) {
++		hidbuf[0] = FEATURE_KBD_REPORT_ID;
++		hidbuf[1] = 0xD1;
++		hidbuf[2] = xpad_cmd_check_ready;
++		hidbuf[3] = 01;
++		ret = asus_kbd_set_report(hdev, hidbuf,
++					  FEATURE_ROG_ALLY_REPORT_SIZE);
++		if (ret < 0) /* only logged; the read-back below still runs */
++			hid_warn(hdev, "ROG Ally check failed set report: %d\n", ret);
++
++		hidbuf[0] = hidbuf[1] = hidbuf[2] = hidbuf[3] = 0; /* clear the request bytes before reading the reply */
++		ret = asus_kbd_get_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE);
++		if (ret < 0) /* only logged; ret is overwritten just below */
++			hid_warn(hdev, "ROG Ally check failed get report: %d\n", ret);
++
++		ret = hidbuf[2] == xpad_cmd_check_ready; /* 1 when the reply echoes the command, else 0 */
++		if (!ret)
++			hid_warn(hdev, "ROG Ally not ready, retry %d\n", count);
++		else
++			break;
++	}
++
++	if (count == 3)
++		hid_err(hdev, "ROG Ally never responded with a ready\n");
++
++	kfree(hidbuf);
++	return ret; /* NOTE(review): 1 or 0 here, so callers testing `ret < 0` only ever see -ENOMEM */
++}
++
++/********** BUTTON REMAPPING *********************************************************************/
++static void __btn_pair_to_pkt(struct device *raw_dev, enum btn_pair pair, u8 *out, int out_len)
++{
++	struct asus_rog_ally *ally = __rog_ally_data(raw_dev);
++	const u8 *mapping = ally->key_mapping[ally->mode - 1][pair - 1];
++	/* packet header: report id, 0xD1, command, pair selector, length (0x2c) */
++	const u8 header[] = { FEATURE_KBD_REPORT_ID, 0xD1, xpad_cmd_set_mapping, pair, 0x2c };
++
++	/* out_len is not consulted: out must hold sizeof(header) + MAPPING_BLOCK_LEN bytes */
++	memcpy(out, header, sizeof(header));
++	memcpy(out + sizeof(header), mapping, MAPPING_BLOCK_LEN);
++}
++
++/* Parse the label in buf into the driver-side mapping slot; the device is untouched until __gamepad_set_mapping */
++static int __gamepad_mapping_store(struct device *raw_dev, const char *buf, enum btn_pair pair,
++				   int side, bool secondary)
++{
++	struct asus_rog_ally *ally = __rog_ally_data(raw_dev);
++	u8 *slot = ally->key_mapping[ally->mode - 1][pair - 1];
++
++	if (side) /* a non-zero side selects the upper half of the pair's block */
++		slot += MAPPING_BLOCK_LEN / 2;
++	if (secondary) /* the secondary code sits right after the primary one */
++		slot += BTN_CODE_LEN;
++
++	return __string_to_key_code(buf, slot, BTN_CODE_LEN);
++}
++
++/* Send the cached mapping of one button pair to the device; returns asus_kbd_set_report()'s result */
++static int __gamepad_set_mapping(struct device *raw_dev, enum btn_pair pair)
++{
++	struct hid_device *hdev = to_hid_device(raw_dev);
++	u8 *hidbuf;
++	int ret;
++
++	ret = __gamepad_check_ready(hdev);
++	if (ret < 0) /* negative only when the readiness check itself could not allocate */
++		return ret;
++
++	hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL);
++	if (!hidbuf)
++		return -ENOMEM;
++
++	__btn_pair_to_pkt(raw_dev, pair, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); /* fill packet from cached mapping */
++	ret = asus_kbd_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE);
++	kfree(hidbuf); /* buffer is only needed for the transfer */
++
++	return ret;
++}
++
++static ssize_t btn_mapping_apply_store(struct device *raw_dev, struct device_attribute *attr,
++				       const char *buf, size_t count)
++{
++	int err = __gamepad_write_all_to_mcu(raw_dev);
++
++	/* any write triggers the flush to the MCU; the written value itself is not read */
++	return err < 0 ? err : (ssize_t)count;
++}
++ALLY_DEVICE_ATTR_WO(btn_mapping_apply, apply);
++
++/********** BUTTON TURBO *************************************************************************/
++static int __gamepad_turbo_index(enum btn_pair pair, int side)
++{
++	return (pair - 1) * (2 * TURBO_BLOCK_STEP) + (side * TURBO_BLOCK_STEP);
++}
++
++/* Read the cached turbo value for (pair, side) in the current mode, located via __gamepad_turbo_index() */
++static int __gamepad_turbo_show(struct device *raw_dev, enum btn_pair pair, int side)
++{
++	struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);
++	return rog_ally->turbo_btns[rog_ally->mode - 1][__gamepad_turbo_index(pair, side)];
++}
++
++/* Parse buf as 0-16 and cache it as the turbo value; 0 on success, kstrtoint() error or -EINVAL otherwise */
++static int __gamepad_turbo_store(struct device *raw_dev, const char *buf, enum btn_pair pair,
++				 int side)
++{
++	struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);
++	int ret, val;
++
++	ret = kstrtoint(buf, 0, &val);
++	if (ret)
++		return ret;
++	if (val < 0 || val > 16)
++		return -EINVAL;
++	rog_ally->turbo_btns[rog_ally->mode - 1][__gamepad_turbo_index(pair, side)] = val;
++	return 0;
++}
++
++/* button map attributes, regular and macro: one ALLY_BTN_MAPPING(name, pair, side) per button */
++ALLY_BTN_MAPPING(m2, btn_pair_m1_m2, btn_pair_side_left); /* NOTE(review): m2 takes the left slot of m1_m2 - confirm intended */
++ALLY_BTN_MAPPING(m1, btn_pair_m1_m2, btn_pair_side_right);
++ALLY_BTN_MAPPING(a, btn_pair_a_b, btn_pair_side_left);
++ALLY_BTN_MAPPING(b, btn_pair_a_b, btn_pair_side_right);
++ALLY_BTN_MAPPING(x, btn_pair_x_y, btn_pair_side_left);
++ALLY_BTN_MAPPING(y, btn_pair_x_y, btn_pair_side_right);
++ALLY_BTN_MAPPING(lb, btn_pair_lb_rb, btn_pair_side_left);
++ALLY_BTN_MAPPING(rb, btn_pair_lb_rb, btn_pair_side_right);
++ALLY_BTN_MAPPING(ls, btn_pair_ls_rs, btn_pair_side_left);
++ALLY_BTN_MAPPING(rs, btn_pair_ls_rs, btn_pair_side_right);
++ALLY_BTN_MAPPING(lt, btn_pair_lt_rt, btn_pair_side_left);
++ALLY_BTN_MAPPING(rt, btn_pair_lt_rt, btn_pair_side_right);
++ALLY_BTN_MAPPING(dpad_u, btn_pair_dpad_u_d, btn_pair_side_left);
++ALLY_BTN_MAPPING(dpad_d, btn_pair_dpad_u_d, btn_pair_side_right);
++ALLY_BTN_MAPPING(dpad_l, btn_pair_dpad_l_r, btn_pair_side_left);
++ALLY_BTN_MAPPING(dpad_r, btn_pair_dpad_l_r, btn_pair_side_right);
++ALLY_BTN_MAPPING(view, btn_pair_view_menu, btn_pair_side_left);
++ALLY_BTN_MAPPING(menu, btn_pair_view_menu, btn_pair_side_right);
++
++static void __gamepad_mapping_xpad_default(struct asus_rog_ally *rog_ally)
++{
++	/* default blocks for mode index 0, one per button pair, in pair order */
++	const void *defaults[] = {
++		&XPAD_DEF1, &XPAD_DEF2, &XPAD_DEF3, &XPAD_DEF4, &XPAD_DEF5,
++		&XPAD_DEF6, &XPAD_DEF7, &XPAD_DEF8, &XPAD_DEF9,
++	};
++	size_t pair;
++
++	for (pair = 0; pair < ARRAY_SIZE(defaults); pair++)
++		memcpy(&rog_ally->key_mapping[0][pair], defaults[pair], MAPPING_BLOCK_LEN);
++}
++
++static void __gamepad_mapping_wasd_default(struct asus_rog_ally *rog_ally)
++{
++	/* default blocks for key_mapping[1][0..8], listed in slot order */
++	const void *const defaults[] = {
++		&WASD_DEF1, &WASD_DEF2, &WASD_DEF3, &WASD_DEF4, &WASD_DEF5,
++		&WASD_DEF6, &WASD_DEF7, &WASD_DEF8, &WASD_DEF9,
++	};
++	size_t slot;
++
++	for (slot = 0; slot < ARRAY_SIZE(defaults); slot++)
++		memcpy(&rog_ally->key_mapping[1][slot], defaults[slot], MAPPING_BLOCK_LEN);
++}
++
++static ssize_t btn_mapping_reset_store(struct device *raw_dev, struct device_attribute *attr,
++				       const char *buf, size_t count)
++{
++	struct asus_rog_ally *ally = __rog_ally_data(raw_dev);
++
++	/*
++	 * Any write restores the stored default button map of the current mode;
++	 * the written text is ignored. Only WASD mode has a table of its own,
++	 * every other mode gets the xpad defaults.
++	 */
++	if (ally->mode == xpad_mode_wasd)
++		__gamepad_mapping_wasd_default(ally);
++	else
++		__gamepad_mapping_xpad_default(ally);
++
++	return count;
++}
++
++/* write-only attribute, exposed in sysfs as "reset_btn_mapping" */
++ALLY_DEVICE_ATTR_WO(btn_mapping_reset, reset_btn_mapping);
++
++/********** GAMEPAD MODE *************************************************************************/
++/*
++ * Switch the gamepad to mode @val, then re-send every stored setting to the MCU.
++ */
++static ssize_t __gamepad_set_mode(struct device *raw_dev, int val)
++{
++	struct hid_device *hdev = to_hid_device(raw_dev);
++	u8 *pkt;
++	int err;
++
++	err = __gamepad_check_ready(hdev);
++	if (err < 0)
++		return err;
++
++	pkt = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL);
++	if (!pkt)
++		return -ENOMEM;
++
++	pkt[0] = FEATURE_KBD_REPORT_ID;
++	pkt[1] = 0xD1;
++	pkt[2] = xpad_cmd_set_mode;
++	pkt[3] = 0x01;
++	pkt[4] = val;
++
++	/* the readiness check is repeated right before the report goes out */
++	err = __gamepad_check_ready(hdev);
++	if (err < 0)
++		goto out_free;
++
++	err = asus_kbd_set_report(hdev, pkt, FEATURE_ROG_ALLY_REPORT_SIZE);
++	if (err >= 0)
++		err = __gamepad_write_all_to_mcu(raw_dev);
++
++out_free:
++	kfree(pkt);
++	return err;
++}
++
++static ssize_t gamepad_mode_show(struct device *raw_dev, struct device_attribute *attr, char *buf)
++{
++	/* report the currently stored gamepad mode as a decimal number */
++	return sysfs_emit(buf, "%d\n", __rog_ally_data(raw_dev)->mode);
++}
++
++/* Parse a mode number, store it, and switch the hardware to that mode. */
++static ssize_t gamepad_mode_store(struct device *raw_dev, struct device_attribute *attr,
++				  const char *buf, size_t count)
++{
++	struct asus_rog_ally *ally = __rog_ally_data(raw_dev);
++	int mode, err;
++
++	err = kstrtoint(buf, 0, &mode);
++	if (err)
++		return err;
++	if (mode < xpad_mode_game || mode > xpad_mode_mouse)
++		return -EINVAL;
++
++	/* stored first (and kept on failure): the MCU rewrite indexes its tables by ->mode */
++	ally->mode = mode;
++
++	err = __gamepad_set_mode(raw_dev, mode);
++	if (err < 0)
++		return err;
++	return count;
++}
++
++DEVICE_ATTR_RW(gamepad_mode);
++
++/********** VIBRATION INTENSITY ******************************************************************/
++static ssize_t gamepad_vibration_intensity_index_show(struct device *raw_dev,
++						      struct device_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "left right\n"); /* order of the vibration_intensity values */
++}
++
++ALLY_DEVICE_ATTR_RO(gamepad_vibration_intensity_index, vibration_intensity_index);
++
++/*
++ * Send the left/right vibration intensities stored for the current mode to the
++ * MCU. Returns a negative errno on failure, else the set-report result.
++ */
++static ssize_t __gamepad_write_vibe_intensity_to_mcu(struct device *raw_dev)
++{
++	struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);
++	struct hid_device *hdev = to_hid_device(raw_dev);
++	u8 *pkt;
++	int err;
++
++	err = __gamepad_check_ready(hdev);
++	if (err < 0)
++		return err;
++
++	pkt = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL);
++	if (!pkt)
++		return -ENOMEM;
++
++	pkt[0] = FEATURE_KBD_REPORT_ID;
++	pkt[1] = 0xD1;
++	pkt[2] = xpad_cmd_set_vibe_intensity;
++	pkt[3] = 0x02; // length
++	pkt[4] = rog_ally->vibration_intensity[rog_ally->mode - 1][btn_pair_side_left];
++	pkt[5] = rog_ally->vibration_intensity[rog_ally->mode - 1][btn_pair_side_right];
++
++	/* second readiness check, immediately before the report is sent */
++	err = __gamepad_check_ready(hdev);
++	if (err >= 0)
++		err = asus_kbd_set_report(hdev, pkt, FEATURE_ROG_ALLY_REPORT_SIZE);
++
++	kfree(pkt);
++	return err;
++}
++
++static ssize_t gamepad_vibration_intensity_show(struct device *raw_dev,
++						struct device_attribute *attr, char *buf)
++{
++	struct asus_rog_ally *ally = __rog_ally_data(raw_dev);
++	int slot = ally->mode - 1; /* per-mode tables are indexed from 0 */
++	return sysfs_emit(buf, "%d %d\n", ally->vibration_intensity[slot][btn_pair_side_left],
++			  ally->vibration_intensity[slot][btn_pair_side_right]);
++}
++
++static ssize_t gamepad_vibration_intensity_store(struct device *raw_dev,
++						 struct device_attribute *attr, const char *buf,
++						 size_t count)
++{
++	struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);
++	u32 left, right;
++	int ret;
++
++	/* expects "<left> <right>", each 0-64; %u matches the u32 targets */
++	if (sscanf(buf, "%u %u", &left, &right) != 2)
++		return -EINVAL;
++	if (left > 64 || right > 64)
++		return -EINVAL;
++
++	rog_ally->vibration_intensity[rog_ally->mode - 1][btn_pair_side_left] = left;
++	rog_ally->vibration_intensity[rog_ally->mode - 1][btn_pair_side_right] = right;
++
++	ret = __gamepad_write_vibe_intensity_to_mcu(raw_dev);
++	if (ret < 0)
++		return ret;
++
++	return count;
++}
++
++ALLY_DEVICE_ATTR_RW(gamepad_vibration_intensity, vibration_intensity);
++
++/********** ROOT LEVEL ATTRS **********************************************************************/
++static struct attribute *gamepad_device_attrs[] = { &dev_attr_gamepad_mode.attr,
++						    &dev_attr_btn_mapping_reset.attr,
++						    &dev_attr_btn_mapping_apply.attr,
++						    &dev_attr_gamepad_vibration_intensity.attr,
++						    &dev_attr_gamepad_vibration_intensity_index.attr,
++						    NULL };
++
++static const struct attribute_group ally_controller_attr_group = {
++	.attrs = gamepad_device_attrs, /* no .name: these sit at the device's root level */
++};
++
++/********** ANALOGUE DEADZONES ********************************************************************/
++/*
++ * Send the current mode's deadzones to the MCU in two reports that share one
++ * buffer: xpad_cmd_set_js_dz with deadzones[..][0][0..3], then
++ * xpad_cmd_set_tr_dz with deadzones[..][1][0..3].
++ */
++static ssize_t __gamepad_set_deadzones(struct device *raw_dev)
++{
++	struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);
++	struct hid_device *hdev = to_hid_device(raw_dev);
++	u8 *pkt;
++	int err, i;
++
++	err = __gamepad_check_ready(hdev);
++	if (err < 0)
++		return err;
++
++	pkt = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL);
++	if (!pkt)
++		return -ENOMEM;
++
++	pkt[0] = FEATURE_KBD_REPORT_ID;
++	pkt[1] = 0xD1;
++	pkt[2] = xpad_cmd_set_js_dz;
++	pkt[3] = 0x04; // length
++	for (i = 0; i < 4; i++)
++		pkt[4 + i] = rog_ally->deadzones[rog_ally->mode - 1][0][i];
++
++	err = asus_kbd_set_report(hdev, pkt, FEATURE_ROG_ALLY_REPORT_SIZE);
++	if (err < 0)
++		goto out_free;
++
++	/* same header and length, only the command and the four payload bytes change */
++	pkt[2] = xpad_cmd_set_tr_dz;
++	for (i = 0; i < 4; i++)
++		pkt[4 + i] = rog_ally->deadzones[rog_ally->mode - 1][1][i];
++
++	err = asus_kbd_set_report(hdev, pkt, FEATURE_ROG_ALLY_REPORT_SIZE);
++
++out_free:
++	kfree(pkt);
++	return err;
++}
++
++/* Parse "<inner> <outer>" and store it for @axis in the current mode; no MCU write here. */
++static ssize_t __gamepad_store_deadzones(struct device *raw_dev, enum xpad_axis axis,
++					 const char *buf)
++{
++	struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);
++	int side, is_tr;
++	u32 inner, outer;
++
++	if (sscanf(buf, "%u %u", &inner, &outer) != 2)
++		return -EINVAL;
++	if (inner > 64 || outer > 64 || inner > outer)
++		return -EINVAL;
++
++	/* row 1 is sent with xpad_cmd_set_tr_dz, row 0 with js_dz; right side uses slots 2-3 */
++	is_tr = axis > xpad_axis_xy_right;
++	side = axis == xpad_axis_xy_right || axis == xpad_axis_z_right ? 2 : 0;
++
++	rog_ally->deadzones[rog_ally->mode - 1][is_tr][side] = inner;
++	rog_ally->deadzones[rog_ally->mode - 1][is_tr][side + 1] = outer;
++
++	return 0;
++}
++
++static ssize_t axis_xyz_deadzone_index_show(struct device *raw_dev, struct device_attribute *attr,
++					    char *buf)
++{
++	return sysfs_emit(buf, "inner outer\n"); /* order of the two deadzone values */
++}
++
++ALLY_DEVICE_ATTR_RO(axis_xyz_deadzone_index, deadzone_index);
++
++ALLY_AXIS_DEADZONE(xpad_axis_xy_left, deadzone);
++ALLY_AXIS_DEADZONE(xpad_axis_xy_right, deadzone);
++ALLY_AXIS_DEADZONE(xpad_axis_z_left, deadzone);
++ALLY_AXIS_DEADZONE(xpad_axis_z_right, deadzone);
++
++/********** ANTI-DEADZONES ***********************************************************************/
++/*
++ * Send the left/right anti-deadzone values stored for the current mode to the
++ * MCU. Returns a negative errno on failure, else the set-report result.
++ */
++static ssize_t __gamepad_write_js_ADZ_to_mcu(struct device *raw_dev)
++{
++	struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);
++	struct hid_device *hdev = to_hid_device(raw_dev);
++	u8 *pkt;
++	int err;
++
++	err = __gamepad_check_ready(hdev);
++	if (err < 0)
++		return err;
++
++	pkt = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL);
++	if (!pkt)
++		return -ENOMEM;
++
++	pkt[0] = FEATURE_KBD_REPORT_ID;
++	pkt[1] = 0xD1;
++	pkt[2] = xpad_cmd_set_adz;
++	pkt[3] = 0x02; // length
++	pkt[4] = rog_ally->anti_deadzones[rog_ally->mode - 1][btn_pair_side_left];
++	pkt[5] = rog_ally->anti_deadzones[rog_ally->mode - 1][btn_pair_side_right];
++
++	/* second readiness check, immediately before the report is sent */
++	err = __gamepad_check_ready(hdev);
++	if (err >= 0)
++		err = asus_kbd_set_report(hdev, pkt, FEATURE_ROG_ALLY_REPORT_SIZE);
++
++	kfree(pkt);
++	return err;
++}
++
++/* Parse one anti-deadzone value (0-32) and store it for @side in the current mode. */
++static ssize_t __gamepad_js_ADZ_store(struct device *raw_dev, const char *buf,
++				      enum btn_pair_side side)
++{
++	struct asus_rog_ally *ally = __rog_ally_data(raw_dev);
++	int adz, err;
++
++	err = kstrtoint(buf, 0, &adz);
++	if (err)
++		return err;
++	if (adz < 0 || adz > 32)
++		return -EINVAL;
++
++	ally->anti_deadzones[ally->mode - 1][side] = adz;
++
++	return 0;
++}
++
++static ssize_t xpad_axis_xy_left_ADZ_show(struct device *raw_dev, struct device_attribute *attr,
++					  char *buf)
++{
++	struct asus_rog_ally *ally = __rog_ally_data(raw_dev);
++	int slot = ally->mode - 1; /* per-mode table index */
++	return sysfs_emit(buf, "%d\n", ally->anti_deadzones[slot][btn_pair_side_left]);
++}
++
++static ssize_t xpad_axis_xy_left_ADZ_store(struct device *raw_dev, struct device_attribute *attr,
++					   const char *buf, size_t count)
++{
++	int err = __gamepad_js_ADZ_store(raw_dev, buf, btn_pair_side_left);
++	/* the helper yields 0 or a negative errno; success consumes the whole write */
++	if (err)
++		return err;
++	return count;
++}
++
++ALLY_DEVICE_ATTR_RW(xpad_axis_xy_left_ADZ, anti_deadzone);
++
++static ssize_t xpad_axis_xy_right_ADZ_show(struct device *raw_dev, struct device_attribute *attr,
++					   char *buf)
++{
++	struct asus_rog_ally *ally = __rog_ally_data(raw_dev);
++	int slot = ally->mode - 1; /* per-mode table index */
++	return sysfs_emit(buf, "%d\n", ally->anti_deadzones[slot][btn_pair_side_right]);
++}
++
++static ssize_t xpad_axis_xy_right_ADZ_store(struct device *raw_dev, struct device_attribute *attr,
++					    const char *buf, size_t count)
++{
++	int err = __gamepad_js_ADZ_store(raw_dev, buf, btn_pair_side_right);
++	/* the helper yields 0 or a negative errno; success consumes the whole write */
++	if (err)
++		return err;
++	return count;
++}
++
++ALLY_DEVICE_ATTR_RW(xpad_axis_xy_right_ADZ, anti_deadzone);
++
++/********** JS RESPONSE CURVES *******************************************************************/
++static ssize_t rc_point_index_show(struct device *raw_dev, struct device_attribute *attr, char *buf)
++{
++	return sysfs_emit(buf, "move response\n"); /* order of the two values of each rc point */
++}
++
++ALLY_DEVICE_ATTR_RO(rc_point_index, rc_point_index);
++
++/* Send the current mode's left (0x01) and right (0x02) response curves, one report each. */
++static ssize_t __gamepad_write_response_curves_to_mcu(struct device *raw_dev)
++{
++	struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);
++	struct hid_device *hdev = to_hid_device(raw_dev);
++	u8 *hidbuf;
++	int ret;
++
++	ret = __gamepad_check_ready(hdev);
++	if (ret < 0)
++		return ret;
++
++	hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL);
++	if (!hidbuf)
++		return -ENOMEM;
++
++	hidbuf[0] = FEATURE_KBD_REPORT_ID;
++	hidbuf[1] = 0xD1;
++	hidbuf[2] = xpad_cmd_set_response_curve;
++	hidbuf[3] = 0x09; // length
++	hidbuf[4] = 0x01;
++	memcpy(&hidbuf[5], &rog_ally->response_curve[rog_ally->mode - 1][btn_pair_side_left], 8);
++
++	/* the left curve must go out before the buffer is reused for the right one */
++	ret = asus_kbd_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE);
++	if (ret < 0)
++		goto report_fail;
++
++	hidbuf[4] = 0x02;
++	memcpy(&hidbuf[5], &rog_ally->response_curve[rog_ally->mode - 1][btn_pair_side_right], 8);
++
++	ret = __gamepad_check_ready(hdev);
++	if (ret < 0)
++		goto report_fail;
++
++	ret = asus_kbd_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE);
++
++report_fail:
++	kfree(hidbuf);
++	return ret;
++}
++
++static ssize_t __gamepad_store_response_curve(struct device *raw_dev, const char *buf,
++					      enum btn_pair_side side, int point)
++{
++	struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);
++	int idx = (point - 1) * 2; /* each point is a (move, response) pair */
++	u32 move, response;
++
++	if (sscanf(buf, "%u %u", &move, &response) != 2)
++		return -EINVAL;
++	/* NOTE(review): probe seeds these tables with values up to 0x50 (80) — confirm 64 */
++	if (move > 64 || response > 64)
++		return -EINVAL;
++
++	rog_ally->response_curve[rog_ally->mode - 1][side][idx] = move;
++	rog_ally->response_curve[rog_ally->mode - 1][side][idx + 1] = response;
++
++	return 0;
++}
++
++ALLY_JS_RC_POINT(left, 1, rc_point_);
++ALLY_JS_RC_POINT(left, 2, rc_point_);
++ALLY_JS_RC_POINT(left, 3, rc_point_);
++ALLY_JS_RC_POINT(left, 4, rc_point_);
++
++ALLY_JS_RC_POINT(right, 1, rc_point_);
++ALLY_JS_RC_POINT(right, 2, rc_point_);
++ALLY_JS_RC_POINT(right, 3, rc_point_);
++ALLY_JS_RC_POINT(right, 4, rc_point_);
++
++/********** CALIBRATIONS *************************************************************************/
++/*
++ * Send the stored calibration for @axis, then the 0x03 sub-command report.
++ */
++static ssize_t __gamepad_write_cal_to_mcu(struct device *raw_dev, enum xpad_axis axis)
++{
++	struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);
++	struct hid_device *hdev = to_hid_device(raw_dev);
++	bool is_trigger = axis > xpad_axis_xy_right;
++	u8 *hidbuf;
++	u8 *c, side, pkt_len, data_len;
++	int ret, cal, checksum = 0;
++
++	ret = __gamepad_check_ready(hdev);
++	if (ret < 0)
++		return ret;
++
++	hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL);
++	if (!hidbuf)
++		return -ENOMEM;
++
++	side = axis == xpad_axis_xy_right || axis == xpad_axis_z_right ? 1 : 0;
++	pkt_len = is_trigger ? 0x06 : 0x0E;
++	data_len = is_trigger ? 2 : 6;
++
++	hidbuf[0] = FEATURE_KBD_REPORT_ID;
++	hidbuf[1] = 0xD1;
++	hidbuf[2] = xpad_cmd_set_calibration;
++	hidbuf[3] = pkt_len;
++	hidbuf[4] = 0x01; // second command (set)
++	hidbuf[5] = axis;
++	c = &hidbuf[6]; // pointer
++
++	for (size_t i = 0; i < data_len; i++) {
++		/* the two-value axes are stored in tr_calibrations, not js_calibrations */
++		if (is_trigger)
++			cal = rog_ally->tr_calibrations[side][i];
++		else
++			cal = rog_ally->js_calibrations[side][i];
++		*c = (u8)((cal & 0xff00) >> 8); /* high byte first */
++		checksum += *c++;
++		*c = (u8)(cal & 0xff);
++		checksum += *c++;
++	}
++
++	hidbuf[6 + data_len * 2] = checksum; /* sum of all payload bytes, truncated to u8 */
++
++	hid_dbg(hdev, "CAL: %*ph\n", 19, hidbuf);
++
++	ret = asus_kbd_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE);
++	if (ret < 0)
++		goto report_fail;
++
++	memset(hidbuf, 0, FEATURE_ROG_ALLY_REPORT_SIZE);
++	hidbuf[0] = FEATURE_KBD_REPORT_ID;
++	hidbuf[1] = 0xD1;
++	hidbuf[2] = xpad_cmd_set_calibration;
++	hidbuf[3] = 0x01; // pkt len
++	hidbuf[4] = 0x03; // second command (set)
++
++	ret = asus_kbd_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE);
++
++report_fail:
++	kfree(hidbuf);
++	return ret;
++}
++
++/*
++ * Parse and store calibration for @axis (six values for xy, two otherwise), then send it.
++ */
++static ssize_t __gamepad_cal_store(struct device *raw_dev, const char *buf, enum xpad_axis axis)
++{
++	struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);
++	u32 x_stable, x_min, x_max, y_stable, y_min, y_max, side;
++
++	side = axis == xpad_axis_xy_right || axis == xpad_axis_z_right ? 1 : 0;
++
++	if (axis == xpad_axis_xy_left || axis == xpad_axis_xy_right) {
++		if (sscanf(buf, "%u %u %u %u %u %u", &x_stable, &x_min, &x_max, &y_stable, &y_min,
++			   &y_max) != 6)
++			return -EINVAL;
++		//TODO: validate input
++
++		/* stored in reverse order for easy copy to packet */
++		rog_ally->js_calibrations[side][0] = y_stable;
++		rog_ally->js_calibrations[side][1] = y_min;
++		rog_ally->js_calibrations[side][2] = y_max;
++		rog_ally->js_calibrations[side][3] = x_stable;
++		rog_ally->js_calibrations[side][4] = x_min;
++		rog_ally->js_calibrations[side][5] = x_max;
++	} else {
++		if (sscanf(buf, "%u %u", &x_stable, &x_max) != 2)
++			return -EINVAL;
++		//TODO: validate input
++
++		rog_ally->tr_calibrations[side][0] = x_stable;
++		rog_ally->tr_calibrations[side][1] = x_max;
++	}
++
++	return __gamepad_write_cal_to_mcu(raw_dev, axis);
++}
++
++/* Print the stored calibration for @axis in the same order the store side parses it. */
++static ssize_t __gamepad_cal_show(struct device *raw_dev, char *buf, enum xpad_axis axis)
++{
++	struct asus_rog_ally *ally = __rog_ally_data(raw_dev);
++	int side = axis == xpad_axis_xy_right || axis == xpad_axis_z_right ? 1 : 0;
++
++	/* everything except the two xy axes is reported as a two-value pair */
++	if (axis != xpad_axis_xy_left && axis != xpad_axis_xy_right)
++		return sysfs_emit(buf, "%d %d\n", ally->tr_calibrations[side][0],
++				  ally->tr_calibrations[side][1]);
++
++	/* the x triple lives in slots 3-5 and the y triple in 0-2; x is printed first */
++	return sysfs_emit(buf, "%d %d %d %d %d %d\n", ally->js_calibrations[side][3],
++			  ally->js_calibrations[side][4], ally->js_calibrations[side][5],
++			  ally->js_calibrations[side][0], ally->js_calibrations[side][1],
++			  ally->js_calibrations[side][2]);
++}
++
++ALLY_CAL_ATTR(xpad_axis_xy_left_cal, xpad_axis_xy_left, calibration);
++ALLY_CAL_ATTR(xpad_axis_xy_right_cal, xpad_axis_xy_right, calibration);
++ALLY_CAL_ATTR(xpad_axis_z_left_cal, xpad_axis_z_left, calibration);
++ALLY_CAL_ATTR(xpad_axis_z_right_cal, xpad_axis_z_right, calibration);
++
++static ssize_t xpad_axis_xy_cal_index_show(struct device *raw_dev, struct device_attribute *attr,
++					   char *buf)
++{
++	return sysfs_emit(buf, "x_stable x_min x_max y_stable y_min y_max\n");
++}
++/* read-only legend for the six values of the xy calibration attributes */
++ALLY_DEVICE_ATTR_RO(xpad_axis_xy_cal_index, calibration_index);
++
++static ssize_t xpad_axis_z_cal_index_show(struct device *raw_dev, struct device_attribute *attr,
++					  char *buf)
++{
++	return sysfs_emit(buf, "z_stable z_max\n");
++}
++/* read-only legend for the two values of the z calibration attributes */
++ALLY_DEVICE_ATTR_RO(xpad_axis_z_cal_index, calibration_index);
++
++/*
++ * Reset the MCU calibration of @axis (sub-command 0x02), then send sub-command 0x03.
++ */
++static ssize_t __gamepad_cal_reset(struct device *raw_dev, const char *buf, enum xpad_axis axis)
++{
++	struct hid_device *hdev = to_hid_device(raw_dev);
++	u8 *hidbuf;
++	int ret;
++
++	ret = __gamepad_check_ready(hdev);
++	if (ret < 0)
++		return ret;
++
++	hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL);
++	if (!hidbuf)
++		return -ENOMEM;
++
++	hidbuf[0] = FEATURE_KBD_REPORT_ID;
++	hidbuf[1] = 0xD1;
++	hidbuf[2] = xpad_cmd_set_calibration;
++	hidbuf[3] = 0x02; // pkt len
++	hidbuf[4] = 0x02; // second command (reset)
++	hidbuf[5] = axis;
++
++	ret = asus_kbd_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE);
++	if (ret < 0)
++		goto report_fail;
++
++	memset(hidbuf, 0, FEATURE_ROG_ALLY_REPORT_SIZE);
++	hidbuf[0] = FEATURE_KBD_REPORT_ID;
++	hidbuf[1] = 0xD1;
++	hidbuf[2] = xpad_cmd_set_calibration;
++	hidbuf[3] = 0x01; // pkt len
++	hidbuf[4] = 0x03; // second command (set)
++
++	ret = asus_kbd_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE);
++	if (ret < 0)
++		goto report_fail;
++
++report_fail:
++	kfree(hidbuf);
++	return ret;
++}
++
++ALLY_CAL_RESET_ATTR(xpad_axis_xy_left_cal_reset, xpad_axis_xy_left, calibration_reset);
++ALLY_CAL_RESET_ATTR(xpad_axis_xy_right_cal_reset, xpad_axis_xy_right, calibration_reset);
++ALLY_CAL_RESET_ATTR(xpad_axis_z_left_cal_reset, xpad_axis_z_left, calibration_reset);
++ALLY_CAL_RESET_ATTR(xpad_axis_z_right_cal_reset, xpad_axis_z_right, calibration_reset);
++
++static struct attribute *gamepad_axis_xy_left_attrs[] = { &dev_attr_xpad_axis_xy_left_deadzone.attr,
++							  &dev_attr_axis_xyz_deadzone_index.attr,
++							  &dev_attr_xpad_axis_xy_left_ADZ.attr,
++							  &dev_attr_xpad_axis_xy_left_cal_reset.attr,
++							  &dev_attr_xpad_axis_xy_left_cal.attr,
++							  &dev_attr_xpad_axis_xy_cal_index.attr,
++							  &dev_attr_rc_point_left_1.attr,
++							  &dev_attr_rc_point_left_2.attr,
++							  &dev_attr_rc_point_left_3.attr,
++							  &dev_attr_rc_point_left_4.attr,
++							  &dev_attr_rc_point_index.attr,
++							  NULL };
++static const struct attribute_group ally_controller_axis_xy_left_attr_group = {
++	.name = "axis_xy_left", /* named group: attributes go in this sysfs subdirectory */
++	.attrs = gamepad_axis_xy_left_attrs,
++};
++
++static struct attribute *gamepad_axis_xy_right_attrs[] = {
++	&dev_attr_xpad_axis_xy_right_deadzone.attr,
++	&dev_attr_axis_xyz_deadzone_index.attr,
++	&dev_attr_xpad_axis_xy_right_ADZ.attr,
++	&dev_attr_xpad_axis_xy_right_cal_reset.attr,
++	&dev_attr_xpad_axis_xy_right_cal.attr,
++	&dev_attr_xpad_axis_xy_cal_index.attr,
++	&dev_attr_rc_point_right_1.attr,
++	&dev_attr_rc_point_right_2.attr,
++	&dev_attr_rc_point_right_3.attr,
++	&dev_attr_rc_point_right_4.attr,
++	&dev_attr_rc_point_index.attr,
++	NULL
++};
++static const struct attribute_group ally_controller_axis_xy_right_attr_group = {
++	.name = "axis_xy_right", /* named group: attributes go in this sysfs subdirectory */
++	.attrs = gamepad_axis_xy_right_attrs,
++};
++
++static struct attribute *gamepad_axis_z_left_attrs[] = {
++	&dev_attr_xpad_axis_z_left_deadzone.attr,  &dev_attr_axis_xyz_deadzone_index.attr,
++	&dev_attr_xpad_axis_z_left_cal.attr,	   &dev_attr_xpad_axis_z_cal_index.attr,
++	&dev_attr_xpad_axis_z_left_cal_reset.attr, NULL
++};
++static const struct attribute_group ally_controller_axis_z_left_attr_group = {
++	.name = "axis_z_left", /* named group: attributes go in this sysfs subdirectory */
++	.attrs = gamepad_axis_z_left_attrs,
++};
++
++static struct attribute *gamepad_axis_z_right_attrs[] = {
++	&dev_attr_xpad_axis_z_right_deadzone.attr,  &dev_attr_axis_xyz_deadzone_index.attr,
++	&dev_attr_xpad_axis_z_right_cal.attr,	    &dev_attr_xpad_axis_z_cal_index.attr,
++	&dev_attr_xpad_axis_z_right_cal_reset.attr, NULL
++};
++static const struct attribute_group ally_controller_axis_z_right_attr_group = {
++	.name = "axis_z_right", /* named group: attributes go in this sysfs subdirectory */
++	.attrs = gamepad_axis_z_right_attrs,
++};
++
++static const struct attribute_group *gamepad_device_attr_groups[] = {
++	&ally_controller_attr_group,
++	&ally_controller_axis_xy_left_attr_group,
++	&ally_controller_axis_xy_right_attr_group,
++	&ally_controller_axis_z_left_attr_group,
++	&ally_controller_axis_z_right_attr_group,
++	&btn_mapping_m1_attr_group,
++	&btn_mapping_m2_attr_group,
++	&btn_mapping_a_attr_group,
++	&btn_mapping_b_attr_group,
++	&btn_mapping_x_attr_group,
++	&btn_mapping_y_attr_group,
++	&btn_mapping_lb_attr_group,
++	&btn_mapping_rb_attr_group,
++	&btn_mapping_ls_attr_group,
++	&btn_mapping_rs_attr_group, /* NOTE(review): lt/rt mapping groups are not listed — confirm */
++	&btn_mapping_dpad_u_attr_group,
++	&btn_mapping_dpad_d_attr_group,
++	&btn_mapping_dpad_l_attr_group,
++	&btn_mapping_dpad_r_attr_group,
++	&btn_mapping_view_attr_group,
++	&btn_mapping_menu_attr_group,
++	NULL
++};
++
++static int __gamepad_write_all_to_mcu(struct device *raw_dev)
++{
++	struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);
++	struct hid_device *hdev = to_hid_device(raw_dev);
++	u8 *hidbuf;
++	int ret = 0;
++	/* push every stored setting for the current mode; the first failure aborts */
++	ret = __gamepad_set_mapping(&hdev->dev, btn_pair_dpad_u_d);
++	if (ret < 0)
++		return ret;
++	ret = __gamepad_set_mapping(&hdev->dev, btn_pair_dpad_l_r);
++	if (ret < 0)
++		return ret;
++	ret = __gamepad_set_mapping(&hdev->dev, btn_pair_ls_rs);
++	if (ret < 0)
++		return ret;
++	ret = __gamepad_set_mapping(&hdev->dev, btn_pair_lb_rb);
++	if (ret < 0)
++		return ret;
++	ret = __gamepad_set_mapping(&hdev->dev, btn_pair_a_b);
++	if (ret < 0)
++		return ret;
++	ret = __gamepad_set_mapping(&hdev->dev, btn_pair_x_y);
++	if (ret < 0)
++		return ret;
++	ret = __gamepad_set_mapping(&hdev->dev, btn_pair_view_menu);
++	if (ret < 0)
++		return ret;
++	ret = __gamepad_set_mapping(&hdev->dev, btn_pair_m1_m2);
++	if (ret < 0)
++		return ret;
++	ret = __gamepad_set_mapping(&hdev->dev, btn_pair_lt_rt);
++	if (ret < 0)
++		return ret;
++	ret = __gamepad_set_deadzones(raw_dev);
++	if (ret < 0)
++		return ret;
++	ret = __gamepad_write_js_ADZ_to_mcu(raw_dev);
++	if (ret < 0)
++		return ret;
++	ret = __gamepad_write_vibe_intensity_to_mcu(raw_dev);
++	if (ret < 0)
++		return ret;
++	ret = __gamepad_write_response_curves_to_mcu(raw_dev);
++	if (ret < 0)
++		return ret;
++
++	ret = __gamepad_check_ready(hdev);
++	if (ret < 0)
++		return ret;
++
++	/* set turbo */
++	hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL);
++	if (!hidbuf)
++		return -ENOMEM;
++	hidbuf[0] = FEATURE_KBD_REPORT_ID;
++	hidbuf[1] = 0xD1;
++	hidbuf[2] = xpad_cmd_set_turbo;
++	hidbuf[3] = 0x20; // length
++	memcpy(&hidbuf[4], rog_ally->turbo_btns[rog_ally->mode - 1], TURBO_BLOCK_LEN);
++	ret = asus_kbd_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE);
++
++	kfree(hidbuf);
++	return ret;
++}
++
++/*
++ * Probe for the ROG Ally gamepad: set up state, defaults, initial MCU config and sysfs.
++ */
++static int asus_rog_ally_probe(struct hid_device *hdev, const struct rog_ops *ops)
++{
++	struct asus_drvdata *drvdata = hid_get_drvdata(hdev);
++	struct asus_rog_ally *ally;
++	int ret = 0;
++
++	/* all ROG devices have this HID interface but we will focus on Ally for now */
++	if (drvdata->quirks & QUIRK_ROG_NKEY_KEYBOARD && hid_is_usb(hdev)) {
++		struct usb_interface *intf = to_usb_interface(hdev->dev.parent);
++
++		if (intf->altsetting->desc.bInterfaceNumber == 0) {
++			hid_info(hdev, "Setting up ROG USB interface\n");
++			/* initialise and set up USB, common to ROG */
++			// TODO:
++
++			/* initialise the Ally data */
++			if (drvdata->quirks & QUIRK_ROG_ALLY_XPAD) {
++				hid_info(hdev, "Setting up ROG Ally interface\n");
++
++				ally = devm_kzalloc(&hdev->dev, sizeof(*ally), GFP_KERNEL);
++				drvdata->rog_ally_data = ally;
++				if (!ally) {
++					hid_err(hdev, "Can't alloc Asus ROG USB interface\n");
++					ret = -ENOMEM;
++					goto err_stop_hw;
++				}
++				// TODO: move these to functions
++				ally->mode = xpad_mode_game;
++				for (int i = 0; i < xpad_mode_mouse; i++) {
++					ally->deadzones[i][0][1] = 64;
++					ally->deadzones[i][0][3] = 64;
++					ally->deadzones[i][1][1] = 64;
++					ally->deadzones[i][1][3] = 64;
++
++					/* same four (move, response) points for both sticks */
++					for (int s = 0; s < 2; s++) {
++						ally->response_curve[i][s][0] = 0x14;
++						ally->response_curve[i][s][1] = 0x14;
++						ally->response_curve[i][s][2] = 0x28;
++						ally->response_curve[i][s][3] = 0x28;
++						ally->response_curve[i][s][4] = 0x3c;
++						ally->response_curve[i][s][5] = 0x3c;
++						ally->response_curve[i][s][6] = 0x50;
++						ally->response_curve[i][s][7] = 0x50;
++					}
++					ally->vibration_intensity[i][0] = 64;
++					ally->vibration_intensity[i][1] = 64;
++				}
++
++				/* ignore all errors for this as they are related to USB HID I/O */
++				__gamepad_mapping_xpad_default(ally);
++				__gamepad_mapping_wasd_default(ally);
++				// these calls will never error so ignore the return
++				__gamepad_mapping_store(&hdev->dev, "kb_f14", btn_pair_m1_m2,
++							btn_pair_side_left, false); // M2
++				__gamepad_mapping_store(&hdev->dev, "kb_f15", btn_pair_m1_m2,
++							btn_pair_side_right, false); // M1
++				__gamepad_set_mapping(&hdev->dev, btn_pair_m1_m2);
++				__gamepad_set_mode(&hdev->dev, xpad_mode_game);
++
++				/* handlers need rog_ally_data: register only here */
++				ret = sysfs_create_groups(&hdev->dev.kobj,
++							  gamepad_device_attr_groups);
++				if (ret)
++					goto err_stop_hw;
++			}
++		}
++	}
++
++	return 0;
++err_stop_hw:
++	hid_hw_stop(hdev);
++	return ret;
++}
++
++void asus_rog_ally_remove(struct hid_device *hdev, const struct rog_ops *ops)
++{
++	struct asus_drvdata *drvdata = hid_get_drvdata(hdev);
++	if (!drvdata->rog_ally_data)
++		return; /* no Ally state was set up for this device, nothing to undo */
++	__gamepad_set_mode(&hdev->dev, xpad_mode_mouse);
++	sysfs_remove_groups(&hdev->dev.kobj, gamepad_device_attr_groups);
++}
++
++const struct rog_ops rog_ally = {
++	.probe = asus_rog_ally_probe, /* Ally gamepad setup */
++	.remove = asus_rog_ally_remove, /* Ally gamepad teardown */
++};
+diff --git a/drivers/hid/hid-asus-rog.h b/drivers/hid/hid-asus-rog.h
+new file mode 100644
+index 000000000000..efad0b041d5d
+--- /dev/null
++++ b/drivers/hid/hid-asus-rog.h
+@@ -0,0 +1,482 @@
++// SPDX-License-Identifier: GPL-2.0-or-later
++/*
++ *  HID driver for Asus ROG laptops and Ally
++ *
++ *  Copyright (c) 2023 Luke Jones <luke@ljones.dev>
++ */
++
++/* data that is private to the hid-asus-rog module */
++
++#include <linux/hid.h>
++#include <linux/types.h>
++
++#define BTN_CODE_LEN 11
++#define MAPPING_BLOCK_LEN 44 /* bytes copied per key_mapping slot */
++
++#define TURBO_BLOCK_LEN 32 /* bytes of turbo settings copied into one report */
++#define TURBO_BLOCK_STEP 2
++
++#define PAD_A "pad_a"
++#define PAD_B "pad_b"
++#define PAD_X "pad_x"
++#define PAD_Y "pad_y"
++#define PAD_LB "pad_lb"
++#define PAD_RB "pad_rb"
++#define PAD_LS "pad_ls"
++#define PAD_RS "pad_rs"
++#define PAD_DPAD_UP "pad_dpad_up"
++#define PAD_DPAD_DOWN "pad_dpad_down"
++#define PAD_DPAD_LEFT "pad_dpad_left"
++#define PAD_DPAD_RIGHT "pad_dpad_right"
++#define PAD_VIEW "pad_view"
++#define PAD_MENU "pad_menu"
++#define PAD_XBOX "pad_xbox"
++
++#define KB_M1 "kb_m1"
++#define KB_M2 "kb_m2"
++#define KB_ESC "kb_esc"
++#define KB_F1 "kb_f1"
++#define KB_F2 "kb_f2"
++#define KB_F3 "kb_f3"
++#define KB_F4 "kb_f4"
++#define KB_F5 "kb_f5"
++#define KB_F6 "kb_f6"
++#define KB_F7 "kb_f7"
++#define KB_F8 "kb_f8"
++#define KB_F9 "kb_f9"
++#define KB_F10 "kb_f10"
++#define KB_F11 "kb_f11"
++#define KB_F12 "kb_f12"
++#define KB_F14 "kb_f14"
++#define KB_F15 "kb_f15"
++
++#define KB_BACKTICK "kb_backtick"
++#define KB_1 "kb_1"
++#define KB_2 "kb_2"
++#define KB_3 "kb_3"
++#define KB_4 "kb_4"
++#define KB_5 "kb_5"
++#define KB_6 "kb_6"
++#define KB_7 "kb_7"
++#define KB_8 "kb_8"
++#define KB_9 "kb_9"
++#define KB_0 "kb_0"
++#define KB_HYPHEN "kb_hyphen"
++#define KB_EQUALS "kb_equals"
++#define KB_BACKSPACE "kb_backspace"
++
++#define KB_TAB "kb_tab"
++#define KB_Q "kb_q"
++#define KB_W "kb_w"
++#define KB_E "kb_e"
++#define KB_R "kb_r"
++#define KB_T "kb_t"
++#define KB_Y "kb_y"
++#define KB_U "kb_u"
++#define KB_I "kb_i"
++#define KB_O "kb_o"
++#define KB_P "kb_p"
++#define KB_LBRACKET "kb_lbracket"
++#define KB_RBRACKET "kb_rbracket"
++#define KB_BACKSLASH "kb_bkslash"
++
++#define KB_CAPS "kb_caps"
++#define KB_A "kb_a"
++#define KB_S "kb_s"
++#define KB_D "kb_d"
++#define KB_F "kb_f"
++#define KB_G "kb_g"
++#define KB_H "kb_h"
++#define KB_J "kb_j"
++#define KB_K "kb_k"
++#define KB_L "kb_l"
++#define KB_SEMI "kb_semicolon"
++#define KB_QUOTE "kb_quote"
++#define KB_RET "kb_enter"
++
++#define KB_LSHIFT "kb_lshift"
++#define KB_Z "kb_z"
++#define KB_X "kb_x"
++#define KB_C "kb_c"
++#define KB_V "kb_v"
++#define KB_B "kb_b"
++#define KB_N "kb_n"
++#define KB_M "kb_m"
++#define KB_COMMA "kb_comma"
++#define KB_PERIOD "kb_period"
++#define KB_FWDSLASH "kb_fwdslash"
++#define KB_RSHIFT "kb_rshift"
++
++#define KB_LCTL "kb_lctl"
++#define KB_META "kb_meta"
++#define KB_LALT "kb_lalt"
++#define KB_SPACE "kb_space"
++#define KB_RALT "kb_ralt"
++#define KB_MENU "kb_menu"
++#define KB_RCTL "kb_rctl"
++
++#define KB_PRNTSCN "kb_prntscn"
++#define KB_SCRLCK "kb_scrlck"
++#define KB_PAUSE "kb_pause"
++#define KB_INS "kb_ins"
++#define KB_HOME "kb_home"
++#define KB_PGUP "kb_pgup"
++#define KB_DEL "kb_del"
++#define KB_END "kb_end"
++#define KB_PGDWN "kb_pgdwn"
++
++#define KB_UP_ARROW "kb_up_arrow"
++#define KB_DOWN_ARROW "kb_down_arrow"
++#define KB_LEFT_ARROW "kb_left_arrow"
++#define KB_RIGHT_ARROW "kb_right_arrow"
++
++#define NUMPAD_LOCK "numpad_lock"
++#define NUMPAD_FWDSLASH "numpad_fwdslash"
++#define NUMPAD_ASTERISK "numpad_asterisk"
++#define NUMPAD_HYPHEN "numpad_hyphen"
++#define NUMPAD_0 "numpad_0"
++#define NUMPAD_1 "numpad_1"
++#define NUMPAD_2 "numpad_2"
++#define NUMPAD_3 "numpad_3"
++#define NUMPAD_4 "numpad_4"
++#define NUMPAD_5 "numpad_5"
++#define NUMPAD_6 "numpad_6"
++#define NUMPAD_7 "numpad_7"
++#define NUMPAD_8 "numpad_8"
++#define NUMPAD_9 "numpad_9"
++#define NUMPAD_PLUS "numpad_plus"
++#define NUMPAD_ENTER "numpad_enter"
++#define NUMPAD_PERIOD "numpad_."
++
++#define RAT_LCLICK "rat_lclick"
++#define RAT_RCLICK "rat_rclick"
++#define RAT_MCLICK "rat_mclick"
++#define RAT_WHEEL_UP "rat_wheel_up"
++#define RAT_WHEEL_DOWN "rat_wheel_down"
++
++#define MEDIA_SCREENSHOT "media_screenshot"
++#define MEDIA_SHOW_KEYBOARD "media_show_keyboard"
++#define MEDIA_SHOW_DESKTOP "media_show_desktop"
++#define MEDIA_START_RECORDING "media_start_recording"
++#define MEDIA_MIC_OFF "media_mic_off"
++#define MEDIA_VOL_DOWN "media_vol_down"
++#define MEDIA_VOL_UP "media_vol_up"
++
++/* required so we can have nested attributes with same name but different functions */
++#define ALLY_DEVICE_ATTR_RW(_name, _sysfs_name)    \
++	struct device_attribute dev_attr_##_name = \
++		__ATTR(_sysfs_name, 0644, _name##_show, _name##_store)
++
++#define ALLY_DEVICE_ATTR_RO(_name, _sysfs_name) \
++	struct device_attribute dev_attr_##_name = __ATTR(_sysfs_name, 0444, _name##_show, NULL)
++
++#define ALLY_DEVICE_ATTR_WO(_name, _sysfs_name) \
++	struct device_attribute dev_attr_##_name = __ATTR(_sysfs_name, 0200, NULL, _name##_store)
++
++/* response curve macros; NOTE(review): _show pairs left[idx] with right[idx + 1] for every attribute - confirm this mix of sides is intended */
++#define ALLY_RESP_CURVE_SHOW(_name, _point_n)                                                    \
++	static ssize_t _name##_show(struct device *raw_dev, struct device_attribute *attr,       \
++				    char *buf)                                                   \
++	{                                                                                        \
++		struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);                       \
++		int idx = (_point_n - 1) * 2;                                                    \
++		return sysfs_emit(                                                               \
++			buf, "%d %d\n",                                                          \
++			rog_ally->response_curve[rog_ally->mode][btn_pair_side_left][idx],       \
++			rog_ally->response_curve[rog_ally->mode][btn_pair_side_right][idx + 1]); \
++	}
++
++#define ALLY_RESP_CURVE_STORE(_name, _side, _point_n)                                         \
++	static ssize_t _name##_store(struct device *raw_dev, struct device_attribute *attr,   \
++				     const char *buf, size_t count)                           \
++	{                                                                                     \
++		int ret = __gamepad_store_response_curve(raw_dev, buf, btn_pair_side_##_side, \
++							 _point_n);                           \
++		if (ret < 0)                                                                  \
++			return ret;                                                           \
++		return count;                                                                 \
++	}
++
++/* _point_n must start at 1 */
++#define ALLY_JS_RC_POINT(_side, _point_n, _sysfs_label)                        \
++	ALLY_RESP_CURVE_SHOW(rc_point_##_side##_##_point_n, _point_n);         \
++	ALLY_RESP_CURVE_STORE(rc_point_##_side##_##_point_n, _side, _point_n); \
++	ALLY_DEVICE_ATTR_RW(rc_point_##_side##_##_point_n, _sysfs_label##_point_n);
++
++/* deadzone macros */
++#define ALLY_AXIS_DEADZONE_SHOW(_axis)                                                    \
++	static ssize_t _axis##_deadzone_show(struct device *raw_dev,                      \
++					     struct device_attribute *attr, char *buf)    \
++	{                                                                                 \
++		struct asus_rog_ally *rog_ally = __rog_ally_data(raw_dev);                \
++		int side, is_tr;                                                          \
++                                                                                          \
++		is_tr = _axis > xpad_axis_xy_right;                                       \
++		side = _axis == xpad_axis_xy_right || _axis == xpad_axis_z_right ? 2 : 0; \
++                                                                                          \
++		return sysfs_emit(buf, "%d %d\n",                                         \
++				  rog_ally->deadzones[rog_ally->mode][is_tr][side],       \
++				  rog_ally->deadzones[rog_ally->mode][is_tr][side + 1]);  \
++	}
++
++#define ALLY_AXIS_DEADZONE_STORE(_axis)                                                       \
++	static ssize_t _axis##_deadzone_store(struct device *raw_dev,                         \
++					      struct device_attribute *attr, const char *buf, \
++					      size_t count)                                   \
++	{                                                                                     \
++		int ret = __gamepad_store_deadzones(raw_dev, _axis, buf);                     \
++		if (ret < 0)                                                                  \
++			return ret;                                                           \
++		return count;                                                                 \
++	}
++
++#define ALLY_AXIS_DEADZONE(_axis, _sysfs_label) \
++	ALLY_AXIS_DEADZONE_SHOW(_axis);         \
++	ALLY_AXIS_DEADZONE_STORE(_axis);        \
++	ALLY_DEVICE_ATTR_RW(_axis##_deadzone, _sysfs_label);
++
++/* button specific macros */
++#define ALLY_BTN_SHOW(_fname, _pair, _side, _secondary)                                     \
++	static ssize_t _fname##_show(struct device *raw_dev, struct device_attribute *attr, \
++				     char *buf)                                             \
++	{                                                                                   \
++		return sysfs_emit(buf, "%s\n",                                              \
++				  __btn_map_to_string(raw_dev, _pair, _side, _secondary));  \
++	}
++
++#define ALLY_BTN_STORE(_fname, _pair, _side, _secondary)                                     \
++	static ssize_t _fname##_store(struct device *raw_dev, struct device_attribute *attr, \
++				      const char *buf, size_t count)                         \
++	{                                                                                    \
++		int ret = __gamepad_mapping_store(raw_dev, buf, _pair, _side, _secondary);   \
++		if (ret < 0)                                                                 \
++			return ret;                                                          \
++		return count;                                                                \
++	}
++
++#define ALLY_BTN_TURBO_SHOW(_fname, _pair, _side)                                                 \
++	static ssize_t _fname##_turbo_show(struct device *raw_dev, struct device_attribute *attr, \
++					   char *buf)                                             \
++	{                                                                                         \
++		return sysfs_emit(buf, "%d\n", __gamepad_turbo_show(raw_dev, _pair, _side));      \
++	}
++
++#define ALLY_BTN_TURBO_STORE(_fname, _pair, _side)                                                 \
++	static ssize_t _fname##_turbo_store(struct device *raw_dev, struct device_attribute *attr, \
++					    const char *buf, size_t count)                         \
++	{                                                                                          \
++		int ret = __gamepad_turbo_store(raw_dev, buf, _pair, _side);                       \
++		if (ret < 0)                                                                       \
++			return ret;                                                                \
++		return count;                                                                      \
++	}
++
++#define ALLY_BTN_ATTRS_GROUP(_name, _fname)                                                    \
++	static struct attribute *_fname##_attrs[] = { &dev_attr_##_fname.attr,                 \
++						      &dev_attr_##_fname##_macro.attr,         \
++						      &dev_attr_##_fname##_turbo.attr, NULL }; \
++	static const struct attribute_group _fname##_attr_group = {                            \
++		.name = __stringify(_name),                                                    \
++		.attrs = _fname##_attrs,                                                       \
++	};
++
++#define ALLY_BTN_MAPPING(_fname, _pair, _side)                            \
++	ALLY_BTN_SHOW(btn_mapping_##_fname, _pair, _side, false);         \
++	ALLY_BTN_STORE(btn_mapping_##_fname, _pair, _side, false);        \
++                                                                          \
++	ALLY_BTN_SHOW(btn_mapping_##_fname##_macro, _pair, _side, true);  \
++	ALLY_BTN_STORE(btn_mapping_##_fname##_macro, _pair, _side, true); \
++                                                                          \
++	ALLY_BTN_TURBO_SHOW(btn_mapping_##_fname, _pair, _side);          \
++	ALLY_BTN_TURBO_STORE(btn_mapping_##_fname, _pair, _side);         \
++                                                                          \
++	ALLY_DEVICE_ATTR_RW(btn_mapping_##_fname, remap);                 \
++	ALLY_DEVICE_ATTR_RW(btn_mapping_##_fname##_macro, macro_remap);   \
++	ALLY_DEVICE_ATTR_RW(btn_mapping_##_fname##_turbo, turbo);         \
++                                                                          \
++	ALLY_BTN_ATTRS_GROUP(btn_##_fname, btn_mapping_##_fname);
++
++/* calibration macros: sysfs handlers forwarding to __gamepad_cal_*(); stores return count unless the helper returns < 0 */
++#define ALLY_CAL_STORE(_fname, _axis)                                                        \
++	static ssize_t _fname##_store(struct device *raw_dev, struct device_attribute *attr, \
++				      const char *buf, size_t count)                         \
++	{                                                                                    \
++		int ret = __gamepad_cal_store(raw_dev, buf, _axis);                          \
++		if (ret < 0)                                                                 \
++			return ret;                                                          \
++		return count;                                                                \
++	}
++
++#define ALLY_CAL_SHOW(_fname, _axis)                                                        \
++	static ssize_t _fname##_show(struct device *raw_dev, struct device_attribute *attr, \
++				     char *buf)                                             \
++	{                                                                                   \
++		return __gamepad_cal_show(raw_dev, buf, _axis);                             \
++	}
++
++#define ALLY_CAL_ATTR(_fname, _axis, _sysfs_label) \
++	ALLY_CAL_STORE(_fname, _axis)              \
++	ALLY_CAL_SHOW(_fname, _axis)               \
++	ALLY_DEVICE_ATTR_RW(_fname, _sysfs_label);
++
++#define ALLY_CAL_RESET_STORE(_fname, _axis)                                                  \
++	static ssize_t _fname##_store(struct device *raw_dev, struct device_attribute *attr, \
++				      const char *buf, size_t count)                         \
++	{                                                                                    \
++		int ret = __gamepad_cal_reset(raw_dev, buf, _axis);                          \
++		if (ret < 0)                                                                 \
++			return ret;                                                          \
++		return count;                                                                \
++	}
++
++#define ALLY_CAL_RESET_ATTR(_fname, _axis, _sysfs_label) \
++	ALLY_CAL_RESET_STORE(_fname, _axis)              \
++	ALLY_DEVICE_ATTR_WO(_fname, _sysfs_label);
++
++/* Default blocks for the xpad mode */
++static const u8 XPAD_DEF1[MAPPING_BLOCK_LEN] = {
++	0x01, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x19,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x03, 0x8c, 0x88, 0x76, 0x00, 0x00
++};
++static const u8 XPAD_DEF2[MAPPING_BLOCK_LEN] = {
++	0x01, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
++	0x00, 0x02, 0x82, 0x23, 0x00, 0x00, 0x00, 0x01, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x02, 0x82, 0x0d, 0x00, 0x00, 0x00
++};
++static const u8 XPAD_DEF3[MAPPING_BLOCK_LEN] = {
++	0x01, 0x07, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 XPAD_DEF4[MAPPING_BLOCK_LEN] = {
++	0x01, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 XPAD_DEF5[MAPPING_BLOCK_LEN] = {
++	0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x16,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x02, 0x82, 0x31, 0x00, 0x00, 0x00
++};
++static const u8 XPAD_DEF6[MAPPING_BLOCK_LEN] = {
++	0x01, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
++	0x00, 0x02, 0x82, 0x4d, 0x00, 0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 XPAD_DEF7[MAPPING_BLOCK_LEN] = {
++	0x01, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 XPAD_DEF8[MAPPING_BLOCK_LEN] = {
++	0x02, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x8e, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x02, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 XPAD_DEF9[MAPPING_BLOCK_LEN] = {
++	0x01, 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++
++/* default blocks for the wasd mode */
++static const u8 WASD_DEF1[MAPPING_BLOCK_LEN] = {
++	0x02, 0x00, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x19,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x03, 0x8c, 0x88, 0x76, 0x00, 0x00
++};
++static const u8 WASD_DEF2[MAPPING_BLOCK_LEN] = {
++	0x02, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
++	0x00, 0x02, 0x82, 0x23, 0x00, 0x00, 0x00, 0x02, 0x00, 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x02, 0x82, 0x0d, 0x00, 0x00, 0x00
++};
++static const u8 WASD_DEF3[MAPPING_BLOCK_LEN] = {
++	0x02, 0x00, 0x88, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 WASD_DEF4[MAPPING_BLOCK_LEN] = {
++	0x02, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 WASD_DEF5[MAPPING_BLOCK_LEN] = {
++	0x02, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x16,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x02, 0x82, 0x31, 0x00, 0x00, 0x00
++};
++static const u8 WASD_DEF6[MAPPING_BLOCK_LEN] = {
++	0x02, 0x00, 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
++	0x00, 0x02, 0x82, 0x4d, 0x00, 0x00, 0x00, 0x02, 0x00, 0x96, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 WASD_DEF7[MAPPING_BLOCK_LEN] = {
++	0x01, 0x11, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 WASD_DEF8[MAPPING_BLOCK_LEN] = {
++	0x02, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x8e, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x02, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++static const u8 WASD_DEF9[MAPPING_BLOCK_LEN] = {
++	0x04, 0x00, 0x00, 0x00, 0x00, 0x02, 0x88, 0x0d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
++	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
++};
++
++/*
++ * the xpad_mode is used inside the mode setting packet and is used
++ * for indexing (xpad_mode - 1)
++ */
++enum xpad_mode {
++	xpad_mode_game = 0x01,
++	xpad_mode_wasd = 0x02,
++	xpad_mode_mouse = 0x03,
++};
++
++/* the xpad_cmd determines which feature is set or queried */
++enum xpad_cmd {
++	xpad_cmd_set_mode = 0x01,
++	xpad_cmd_set_mapping = 0x02,
++	xpad_cmd_set_js_dz = 0x04, /* deadzones */
++	xpad_cmd_set_tr_dz = 0x05, /* deadzones */
++	xpad_cmd_set_vibe_intensity = 0x06,
++	xpad_cmd_check_ready = 0x0A,
++	xpad_cmd_set_calibration = 0x0D,
++	xpad_cmd_set_turbo = 0x0F,
++	xpad_cmd_set_response_curve = 0x13,
++	xpad_cmd_set_adz = 0x18,
++};
++
++/*
++ * the xpad_axis is used in various set and query HID packets and is
++ * used for indexing (xpad_axis - 1)
++ */
++enum xpad_axis {
++	xpad_axis_xy_left = 0x01,
++	xpad_axis_xy_right = 0x02,
++	xpad_axis_z_left = 0x03,
++	xpad_axis_z_right = 0x04,
++};
++
++enum btn_pair {
++	btn_pair_dpad_u_d = 0x01,
++	btn_pair_dpad_l_r = 0x02,
++	btn_pair_ls_rs = 0x03,
++	btn_pair_lb_rb = 0x04,
++	btn_pair_a_b = 0x05,
++	btn_pair_x_y = 0x06,
++	btn_pair_view_menu = 0x07,
++	btn_pair_m1_m2 = 0x08,
++	btn_pair_lt_rt = 0x09,
++};
++
++enum btn_pair_side {
++	btn_pair_side_left = 0x00,
++	btn_pair_side_right = 0x01,
++};
++
++static int __gamepad_write_all_to_mcu(struct device *raw_dev);
+\ No newline at end of file
+diff --git a/drivers/hid/hid-asus.h b/drivers/hid/hid-asus.h
+new file mode 100644
+index 000000000000..18317cad7110
+--- /dev/null
++++ b/drivers/hid/hid-asus.h
+@@ -0,0 +1,58 @@
++/* SPDX-License-Identifier: GPL-2.0-or-later */
++/*
++ *  HID driver for Asus ROG laptops and Ally
++ *
++ *  Copyright (c) 2023 Luke Jones <luke@ljones.dev>
++ */
++
++#include <linux/hid.h>
++#include <linux/types.h>
++
++#define FEATURE_KBD_REPORT_ID		0x5a
++#define FEATURE_KBD_REPORT_SIZE		16
++#define FEATURE_KBD_LED_REPORT_ID1	0x5d
++#define FEATURE_KBD_LED_REPORT_ID2	0x5e
++#define FEATURE_ROG_ALLY_REPORT_SIZE	64
++
++#define QUIRK_FIX_NOTEBOOK_REPORT	BIT(0)
++#define QUIRK_NO_INIT_REPORTS		BIT(1)
++#define QUIRK_SKIP_INPUT_MAPPING	BIT(2)
++#define QUIRK_IS_MULTITOUCH		BIT(3)
++#define QUIRK_NO_CONSUMER_USAGES	BIT(4)
++#define QUIRK_USE_KBD_BACKLIGHT		BIT(5)
++#define QUIRK_T100_KEYBOARD		BIT(6)
++#define QUIRK_T100CHI			BIT(7)
++#define QUIRK_G752_KEYBOARD		BIT(8)
++#define QUIRK_T90CHI			BIT(9)
++#define QUIRK_MEDION_E1239T		BIT(10)
++#define QUIRK_ROG_NKEY_KEYBOARD		BIT(11)
++#define QUIRK_ROG_CLAYMORE_II_KEYBOARD	BIT(12)
++#define QUIRK_ROG_ALLY_XPAD		BIT(13)
++
++struct asus_drvdata {
++	unsigned long quirks;
++	struct hid_device *hdev;
++	struct input_dev *input;
++	struct input_dev *tp_kbd_input;
++	struct asus_kbd_leds *kbd_backlight;
++	const struct asus_touchpad_info *tp;
++	bool enable_backlight;
++	struct power_supply *battery;
++	struct power_supply_desc battery_desc;
++	int battery_capacity;
++	int battery_stat;
++	bool battery_in_query;
++	unsigned long battery_next_query;
++	struct asus_rog_ally *rog_ally_data;
++};
++
++extern int asus_kbd_set_report(struct hid_device *hdev, const u8 *buf, size_t buf_size);
++
++extern int asus_kbd_get_report(struct hid_device *hdev, u8 *out_buf, size_t out_buf_size);
++
++struct rog_ops {
++	int (*probe) (struct hid_device *hdev, const struct rog_ops *ops);
++	void (*remove) (struct hid_device *hdev, const struct rog_ops *ops);
++};
++
++extern const struct rog_ops rog_ally;
+\ No newline at end of file
+diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h
+index 828a5c022c64..79ae30212725 100644
+--- a/drivers/hid/hid-ids.h
++++ b/drivers/hid/hid-ids.h
+@@ -208,6 +208,7 @@
+ #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD	0x1866
+ #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD2	0x19b6
+ #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD3	0x1a30
++#define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY		0x1abe
+ #define USB_DEVICE_ID_ASUSTEK_ROG_CLAYMORE_II_KEYBOARD	0x196b
+ #define USB_DEVICE_ID_ASUSTEK_FX503VD_KEYBOARD	0x1869
+ 
 -- 
 2.44.0
 
-From e01d8909a6a6d90eb2ff29871d79f4e9359638ca Mon Sep 17 00:00:00 2001
+From a9a585447d5ab5667a7139052240a9abd6eb20b2 Mon Sep 17 00:00:00 2001
 From: Peter Jung <admin@ptr1337.dev>
-Date: Mon, 26 Feb 2024 15:48:00 +0100
-Subject: [PATCH 6/7] ksm
+Date: Wed, 3 Apr 2024 17:07:11 +0200
+Subject: [PATCH 7/8] ksm
 
 Signed-off-by: Peter Jung <admin@ptr1337.dev>
 ---
@@ -9005,7 +13871,7 @@ index dd116598fb25..28e59bfe9474 100644
 +463	common	process_ksm_disable		sys_process_ksm_disable
 +464	common	process_ksm_status		sys_process_ksm_status
 diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
-index 77eb9b0e7685..c30bd8068c83 100644
+index e619ac10cd23..ad49152558ac 100644
 --- a/include/linux/syscalls.h
 +++ b/include/linux/syscalls.h
 @@ -818,6 +818,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
@@ -9042,10 +13908,10 @@ index 75f00965ab15..c46daa8bda1e 100644
  /*
   * 32 bit systems traditionally used different
 diff --git a/kernel/sys.c b/kernel/sys.c
-index f8e543f1e38a..a02487c059f3 100644
+index 8bb106a56b3a..34b78c8e7bc6 100644
 --- a/kernel/sys.c
 +++ b/kernel/sys.c
-@@ -2764,6 +2764,153 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
+@@ -2767,6 +2767,153 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
  	return error;
  }
  
@@ -9216,72 +14082,72 @@ index faad00cce269..c7c9eb656468 100644
 -- 
 2.44.0
 
-From 0634ad09765970da5be85d61cb4b8b4b38adb3c0 Mon Sep 17 00:00:00 2001
+From ef7086e8c29b16dc397fcf7e29f4608865f41cad Mon Sep 17 00:00:00 2001
 From: Peter Jung <admin@ptr1337.dev>
-Date: Thu, 1 Feb 2024 16:54:48 +0100
-Subject: [PATCH 7/7] zstd
+Date: Wed, 3 Apr 2024 17:07:41 +0200
+Subject: [PATCH 8/8] zstd
 
 Signed-off-by: Peter Jung <admin@ptr1337.dev>
 ---
  include/linux/zstd.h                          |    2 +-
  include/linux/zstd_errors.h                   |   23 +-
- include/linux/zstd_lib.h                      |  697 +++++--
+ include/linux/zstd_lib.h                      |  850 +++++--
  lib/zstd/Makefile                             |    2 +-
  lib/zstd/common/allocations.h                 |   56 +
  lib/zstd/common/bits.h                        |  149 ++
- lib/zstd/common/bitstream.h                   |   53 +-
- lib/zstd/common/compiler.h                    |   14 +-
+ lib/zstd/common/bitstream.h                   |  127 +-
+ lib/zstd/common/compiler.h                    |  134 +-
  lib/zstd/common/cpu.h                         |    3 +-
- lib/zstd/common/debug.c                       |    5 +-
- lib/zstd/common/debug.h                       |    3 +-
+ lib/zstd/common/debug.c                       |    9 +-
+ lib/zstd/common/debug.h                       |   34 +-
  lib/zstd/common/entropy_common.c              |   42 +-
  lib/zstd/common/error_private.c               |   12 +-
- lib/zstd/common/error_private.h               |    3 +-
- lib/zstd/common/fse.h                         |   89 +-
- lib/zstd/common/fse_decompress.c              |   94 +-
- lib/zstd/common/huf.h                         |  222 +--
- lib/zstd/common/mem.h                         |    2 +-
- lib/zstd/common/portability_macros.h          |   26 +-
+ lib/zstd/common/error_private.h               |   84 +-
+ lib/zstd/common/fse.h                         |   94 +-
+ lib/zstd/common/fse_decompress.c              |  130 +-
+ lib/zstd/common/huf.h                         |  237 +-
+ lib/zstd/common/mem.h                         |    3 +-
+ lib/zstd/common/portability_macros.h          |   28 +-
  lib/zstd/common/zstd_common.c                 |   38 +-
  lib/zstd/common/zstd_deps.h                   |   16 +-
- lib/zstd/common/zstd_internal.h               |   99 +-
+ lib/zstd/common/zstd_internal.h               |  109 +-
  lib/zstd/compress/clevels.h                   |    3 +-
- lib/zstd/compress/fse_compress.c              |   59 +-
+ lib/zstd/compress/fse_compress.c              |   74 +-
  lib/zstd/compress/hist.c                      |    3 +-
  lib/zstd/compress/hist.h                      |    3 +-
- lib/zstd/compress/huf_compress.c              |  372 ++--
- lib/zstd/compress/zstd_compress.c             | 1762 ++++++++++++-----
- lib/zstd/compress/zstd_compress_internal.h    |  333 +++-
+ lib/zstd/compress/huf_compress.c              |  441 ++--
+ lib/zstd/compress/zstd_compress.c             | 2111 ++++++++++++-----
+ lib/zstd/compress/zstd_compress_internal.h    |  359 ++-
  lib/zstd/compress/zstd_compress_literals.c    |  155 +-
  lib/zstd/compress/zstd_compress_literals.h    |   25 +-
  lib/zstd/compress/zstd_compress_sequences.c   |    7 +-
  lib/zstd/compress/zstd_compress_sequences.h   |    3 +-
- lib/zstd/compress/zstd_compress_superblock.c  |   47 +-
+ lib/zstd/compress/zstd_compress_superblock.c  |  376 ++-
  lib/zstd/compress/zstd_compress_superblock.h  |    3 +-
- lib/zstd/compress/zstd_cwksp.h                |  149 +-
- lib/zstd/compress/zstd_double_fast.c          |  129 +-
- lib/zstd/compress/zstd_double_fast.h          |    6 +-
- lib/zstd/compress/zstd_fast.c                 |  582 ++++--
+ lib/zstd/compress/zstd_cwksp.h                |  169 +-
+ lib/zstd/compress/zstd_double_fast.c          |  143 +-
+ lib/zstd/compress/zstd_double_fast.h          |   17 +-
+ lib/zstd/compress/zstd_fast.c                 |  596 +++--
  lib/zstd/compress/zstd_fast.h                 |    6 +-
- lib/zstd/compress/zstd_lazy.c                 |  518 ++---
- lib/zstd/compress/zstd_lazy.h                 |    7 +-
- lib/zstd/compress/zstd_ldm.c                  |   11 +-
+ lib/zstd/compress/zstd_lazy.c                 |  732 +++---
+ lib/zstd/compress/zstd_lazy.h                 |  138 +-
+ lib/zstd/compress/zstd_ldm.c                  |   21 +-
  lib/zstd/compress/zstd_ldm.h                  |    3 +-
  lib/zstd/compress/zstd_ldm_geartab.h          |    3 +-
- lib/zstd/compress/zstd_opt.c                  |  187 +-
- lib/zstd/compress/zstd_opt.h                  |    3 +-
- lib/zstd/decompress/huf_decompress.c          |  770 ++++---
+ lib/zstd/compress/zstd_opt.c                  |  497 ++--
+ lib/zstd/compress/zstd_opt.h                  |   41 +-
+ lib/zstd/decompress/huf_decompress.c          |  887 ++++---
  lib/zstd/decompress/zstd_ddict.c              |    9 +-
  lib/zstd/decompress/zstd_ddict.h              |    3 +-
- lib/zstd/decompress/zstd_decompress.c         |  263 ++-
- lib/zstd/decompress/zstd_decompress_block.c   |  283 ++-
- lib/zstd/decompress/zstd_decompress_block.h   |    8 +-
- .../decompress/zstd_decompress_internal.h     |    7 +-
+ lib/zstd/decompress/zstd_decompress.c         |  358 ++-
+ lib/zstd/decompress/zstd_decompress_block.c   |  708 +++---
+ lib/zstd/decompress/zstd_decompress_block.h   |   10 +-
+ .../decompress/zstd_decompress_internal.h     |    9 +-
  lib/zstd/decompress_sources.h                 |    2 +-
  lib/zstd/zstd_common_module.c                 |    5 +-
  lib/zstd/zstd_compress_module.c               |    2 +-
  lib/zstd/zstd_decompress_module.c             |    4 +-
- 58 files changed, 4790 insertions(+), 2595 deletions(-)
+ 58 files changed, 6577 insertions(+), 3531 deletions(-)
  create mode 100644 lib/zstd/common/allocations.h
  create mode 100644 lib/zstd/common/bits.h
 
@@ -9364,7 +14230,7 @@ index 58b6dd45a969..6d5cf55f0bf3 100644
  } ZSTD_ErrorCode;
  
 diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h
-index 79d55465d5c1..8b4ffe649df5 100644
+index 79d55465d5c1..6320fedcf8a4 100644
 --- a/include/linux/zstd_lib.h
 +++ b/include/linux/zstd_lib.h
 @@ -1,5 +1,6 @@
@@ -9427,7 +14293,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
  #define ZSTD_VERSION_MAJOR    1
  #define ZSTD_VERSION_MINOR    5
 -#define ZSTD_VERSION_RELEASE  2
-+#define ZSTD_VERSION_RELEASE  5
++#define ZSTD_VERSION_RELEASE  6
  #define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
  
  /*! ZSTD_versionNumber() :
@@ -9474,7 +14340,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
 + * for example to size a static array on stack.
 + * Will produce constant value 0 if srcSize too large.
 + */
-+#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U)
++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U)
 +#define ZSTD_COMPRESSBOUND(srcSize)   (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
 +ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
 +/* ZSTD_isError() :
@@ -9485,7 +14351,90 @@ index 79d55465d5c1..8b4ffe649df5 100644
  ZSTDLIB_API unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
  ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
  ZSTDLIB_API int         ZSTD_minCLevel(void);               /*!< minimum negative compression level allowed, requires v1.4.0+ */
-@@ -412,6 +457,9 @@ typedef enum {
+@@ -183,7 +228,7 @@ ZSTDLIB_API int         ZSTD_defaultCLevel(void);           /*!< default compres
+ /*= Compression context
+  *  When compressing many times,
+  *  it is recommended to allocate a context just once,
+- *  and re-use it for each successive compression operation.
++ *  and reuse it for each successive compression operation.
+  *  This will make workload friendlier for system's memory.
+  *  Note : re-using context is just a speed / resource optimization.
+  *         It doesn't change the compression ratio, which remains identical.
+@@ -196,9 +241,9 @@ ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);  /* accept NULL pointer *
+ 
+ /*! ZSTD_compressCCtx() :
+  *  Same as ZSTD_compress(), using an explicit ZSTD_CCtx.
+- *  Important : in order to behave similarly to `ZSTD_compress()`,
+- *  this function compresses at requested compression level,
+- *  __ignoring any other parameter__ .
++ *  Important : in order to mirror `ZSTD_compress()` behavior,
++ *  this function compresses at the requested compression level,
++ *  __ignoring any other advanced parameter__ .
+  *  If any advanced parameter was set using the advanced API,
+  *  they will all be reset. Only `compressionLevel` remains.
+  */
+@@ -210,7 +255,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
+ /*= Decompression context
+  *  When decompressing many times,
+  *  it is recommended to allocate a context only once,
+- *  and re-use it for each successive compression operation.
++ *  and reuse it for each successive compression operation.
+  *  This will make workload friendlier for system's memory.
+  *  Use one context per thread for parallel execution. */
+ typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+@@ -220,7 +265,7 @@ ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);  /* accept NULL pointer *
+ /*! ZSTD_decompressDCtx() :
+  *  Same as ZSTD_decompress(),
+  *  requires an allocated ZSTD_DCtx.
+- *  Compatible with sticky parameters.
++ *  Compatible with sticky parameters (see below).
+  */
+ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
+                                        void* dst, size_t dstCapacity,
+@@ -236,12 +281,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
+  *   using ZSTD_CCtx_set*() functions.
+  *   Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame.
+  *   "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` !
+- *   __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ .
++ *   __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ .
+  *
+  *   It's possible to reset all parameters to "default" using ZSTD_CCtx_reset().
+  *
+  *   This API supersedes all other "advanced" API entry points in the experimental section.
+- *   In the future, we expect to remove from experimental API entry points which are redundant with this API.
++ *   In the future, we expect to remove API entry points from experimental which are redundant with this API.
+  */
+ 
+ 
+@@ -324,6 +369,19 @@ typedef enum {
+                               * The higher the value of selected strategy, the more complex it is,
+                               * resulting in stronger and slower compression.
+                               * Special: value 0 means "use default strategy". */
++
++    ZSTD_c_targetCBlockSize=130, /* v1.5.6+
++                                  * Attempts to fit compressed block size into approximately targetCBlockSize.
++                                  * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX.
++                                  * Note that it's not a guarantee, just a convergence target (default:0).
++                                  * No target when targetCBlockSize == 0.
++                                  * This is helpful in low bandwidth streaming environments to improve end-to-end latency,
++                                  * when a client can make use of partial documents (a prominent example being Chrome).
++                                  * Note: this parameter is stable since v1.5.6.
++                                  * It was present as an experimental parameter in earlier versions,
++                                  * but it's not recommended using it with earlier library versions
++                                  * due to massive performance regressions.
++                                  */
+     /* LDM mode parameters */
+     ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching.
+                                      * This parameter is designed to improve compression ratio
+@@ -403,7 +461,6 @@ typedef enum {
+      * ZSTD_c_forceMaxWindow
+      * ZSTD_c_forceAttachDict
+      * ZSTD_c_literalCompressionMode
+-     * ZSTD_c_targetCBlockSize
+      * ZSTD_c_srcSizeHint
+      * ZSTD_c_enableDedicatedDictSearch
+      * ZSTD_c_stableInBuffer
+@@ -412,6 +469,9 @@ typedef enum {
       * ZSTD_c_validateSequences
       * ZSTD_c_useBlockSplitter
       * ZSTD_c_useRowMatchFinder
@@ -9495,7 +14444,16 @@ index 79d55465d5c1..8b4ffe649df5 100644
       * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
       * note : never ever use experimentalParam? names directly;
       *        also, the enums values themselves are unstable and can still change.
-@@ -430,7 +478,11 @@ typedef enum {
+@@ -421,7 +481,7 @@ typedef enum {
+      ZSTD_c_experimentalParam3=1000,
+      ZSTD_c_experimentalParam4=1001,
+      ZSTD_c_experimentalParam5=1002,
+-     ZSTD_c_experimentalParam6=1003,
++     /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */
+      ZSTD_c_experimentalParam7=1004,
+      ZSTD_c_experimentalParam8=1005,
+      ZSTD_c_experimentalParam9=1006,
+@@ -430,7 +490,11 @@ typedef enum {
       ZSTD_c_experimentalParam12=1009,
       ZSTD_c_experimentalParam13=1010,
       ZSTD_c_experimentalParam14=1011,
@@ -9508,7 +14466,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
  } ZSTD_cParameter;
  
  typedef struct {
-@@ -493,7 +545,7 @@ typedef enum {
+@@ -493,7 +557,7 @@ typedef enum {
   *                  They will be used to compress next frame.
   *                  Resetting session never fails.
   *  - The parameters : changes all parameters back to "default".
@@ -9517,7 +14475,12 @@ index 79d55465d5c1..8b4ffe649df5 100644
   *                  Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing)
   *                  otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError())
   *  - Both : similar to resetting the session, followed by resetting parameters.
-@@ -506,7 +558,8 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
+@@ -502,11 +566,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
+ 
+ /*! ZSTD_compress2() :
+  *  Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API.
++ *  (note that this entry point doesn't even expose a compression level parameter).
+  *  ZSTD_compress2() always starts a new frame.
   *  Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
   *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
   *  - The function is always blocking, returns when compression is completed.
@@ -9527,11 +14490,12 @@ index 79d55465d5c1..8b4ffe649df5 100644
   * @return : compressed size written into `dst` (<= `dstCapacity),
   *           or an error code if it fails (which can be tested using ZSTD_isError()).
   */
-@@ -543,13 +596,15 @@ typedef enum {
+@@ -543,13 +609,17 @@ typedef enum {
       * ZSTD_d_stableOutBuffer
       * ZSTD_d_forceIgnoreChecksum
       * ZSTD_d_refMultipleDDicts
 +     * ZSTD_d_disableHuffmanAssembly
++     * ZSTD_d_maxBlockSize
       * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
       * note : never ever use experimentalParam? names directly
       */
@@ -9540,11 +14504,41 @@ index 79d55465d5c1..8b4ffe649df5 100644
       ZSTD_d_experimentalParam3=1002,
 -     ZSTD_d_experimentalParam4=1003
 +     ZSTD_d_experimentalParam4=1003,
-+     ZSTD_d_experimentalParam5=1004
++     ZSTD_d_experimentalParam5=1004,
++     ZSTD_d_experimentalParam6=1005
  
  } ZSTD_dParameter;
  
-@@ -728,8 +783,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /*< recommended size for output
+@@ -604,14 +674,14 @@ typedef struct ZSTD_outBuffer_s {
+ *  A ZSTD_CStream object is required to track streaming operation.
+ *  Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
+ *  ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
+-*  It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
++*  It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
+ *
+ *  For parallel execution, use one separate ZSTD_CStream per thread.
+ *
+ *  note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
+ *
+ *  Parameters are sticky : when starting a new compression on the same context,
+-*  it will re-use the same sticky parameters as previous compression session.
++*  it will reuse the same sticky parameters as previous compression session.
+ *  When in doubt, it's recommended to fully initialize the context before usage.
+ *  Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
+ *  ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
+@@ -700,6 +770,11 @@ typedef enum {
+  *            only ZSTD_e_end or ZSTD_e_flush operations are allowed.
+  *            Before starting a new compression job, or changing compression parameters,
+  *            it is required to fully flush internal buffers.
++ *  - note: if an operation ends with an error, it may leave @cctx in an undefined state.
++ *          Therefore, it's UB to invoke ZSTD_compressStream2() or ZSTD_compressStream() on such a state.
++ *          In order to be re-employed after an error, a state must be reset,
++ *          which can be done explicitly (ZSTD_CCtx_reset()),
++ *          or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx())
+  */
+ ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+                                          ZSTD_outBuffer* output,
+@@ -728,8 +803,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /*< recommended size for output
   * This following is a legacy streaming API, available since v1.0+ .
   * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
   * It is redundant, but remains fully supported.
@@ -9553,7 +14547,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   ******************************************************************************/
  
  /*!
-@@ -738,6 +791,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /*< recommended size for output
+@@ -738,6 +811,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /*< recommended size for output
   *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
   *     ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
   *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
@@ -9563,7 +14557,16 @@ index 79d55465d5c1..8b4ffe649df5 100644
   */
  ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
  /*!
-@@ -788,13 +844,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);  /* accept NULL pointer
+@@ -758,7 +834,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+ *
+ *  A ZSTD_DStream object is required to track streaming operations.
+ *  Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+-*  ZSTD_DStream objects can be re-used multiple times.
++*  ZSTD_DStream objects can be reused multiple times.
+ *
+ *  Use ZSTD_initDStream() to start a new decompression operation.
+ * @return : recommended first input size
+@@ -788,13 +864,37 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);  /* accept NULL pointer
  
  /*===== Streaming decompression functions =====*/
  
@@ -9592,11 +14595,17 @@ index 79d55465d5c1..8b4ffe649df5 100644
 + * @return : 0 when a frame is completely decoded and fully flushed,
 + *           or an error code, which can be tested using ZSTD_isError(),
 + *           or any other value > 0, which means there is some decoding or flushing to do to complete current frame.
++ *
++ * Note: when an operation returns with an error code, the @zds state may be left in an undefined state.
++ *       It's UB to invoke `ZSTD_decompressStream()` on such a state.
++ *       In order to re-use such a state, it must be first reset,
++ *       which can be done explicitly (`ZSTD_DCtx_reset()`),
++ *       or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`)
 + */
  ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
  
  ZSTDLIB_API size_t ZSTD_DStreamInSize(void);    /*!< recommended size for input buffer */
-@@ -913,7 +987,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+@@ -913,7 +1013,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
   *  If @return == 0, the dictID could not be decoded.
   *  This could for one of the following reasons :
   *  - The frame does not require a dictionary to be decoded (most common case).
@@ -9605,7 +14614,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   *    Note : this use case also happens when using a non-conformant dictionary.
   *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
   *  - This is not a Zstandard frame.
-@@ -925,9 +999,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+@@ -925,9 +1025,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
   * Advanced dictionary and prefix API (Requires v1.4.0+)
   *
   * This API allows dictionaries to be used with ZSTD_compress2(),
@@ -9613,14 +14622,14 @@ index 79d55465d5c1..8b4ffe649df5 100644
 - * only reset with the context is reset with ZSTD_reset_parameters or
 - * ZSTD_reset_session_and_parameters. Prefixes are single-use.
 + * ZSTD_compressStream2(), and ZSTD_decompressDCtx().
-+ * Dictionaries are sticky, they remain valid when same context is re-used,
++ * Dictionaries are sticky, they remain valid when same context is reused,
 + * they only reset when the context is reset
 + * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters.
 + * In contrast, Prefixes are single-use.
   ******************************************************************************/
  
  
-@@ -937,8 +1013,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+@@ -937,8 +1039,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
   * @result : 0, or an error code (which can be tested with ZSTD_isError()).
   *  Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
   *           meaning "return to no-dictionary mode".
@@ -9632,7 +14641,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   *  Note 2 : Loading a dictionary involves building tables.
   *           It's also a CPU consuming operation, with non-negligible impact on latency.
   *           Tables are dependent on compression parameters, and for this reason,
-@@ -947,11 +1024,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+@@ -947,11 +1050,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
   *           Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
   *           In such a case, dictionary buffer must outlive its users.
   *  Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
@@ -9650,7 +14659,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   *  Note that compression parameters are enforced from within CDict,
   *  and supersede any compression parameter previously set within CCtx.
   *  The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
-@@ -970,6 +1051,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+@@ -970,6 +1077,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
   *  Decompression will need same prefix to properly regenerate data.
   *  Compressing with a prefix is similar in outcome as performing a diff and compressing it,
   *  but performs much faster, especially during decompression (compression speed is tunable with compression level).
@@ -9658,7 +14667,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   * @result : 0, or an error code (which can be tested with ZSTD_isError()).
   *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
   *  Note 1 : Prefix buffer is referenced. It **must** outlive compression.
-@@ -986,9 +1068,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
+@@ -986,9 +1094,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
                                   const void* prefix, size_t prefixSize);
  
  /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+
@@ -9671,7 +14680,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   * @result : 0, or an error code (which can be tested with ZSTD_isError()).
   *  Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
   *            meaning "return to no-dictionary mode".
-@@ -1012,9 +1094,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s
+@@ -1012,9 +1120,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s
   *  The memory for the table is allocated on the first call to refDDict, and can be
   *  freed with ZSTD_freeDCtx().
   *
@@ -9684,7 +14693,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   *  Special: referencing a NULL DDict means "return to no-dictionary mode".
   *  Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
   */
-@@ -1071,24 +1154,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+@@ -1071,24 +1180,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
  #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE
  #endif
  
@@ -9709,7 +14718,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
  /* **************************************************************************************
   *   experimental API (static linking only)
   ****************************************************************************************
-@@ -1123,6 +1188,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+@@ -1123,6 +1214,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
  #define ZSTD_TARGETLENGTH_MIN     0   /* note : comparing this constant to an unsigned results in a tautological test */
  #define ZSTD_STRATEGY_MIN        ZSTD_fast
  #define ZSTD_STRATEGY_MAX        ZSTD_btultra2
@@ -9717,7 +14726,16 @@ index 79d55465d5c1..8b4ffe649df5 100644
  
  
  #define ZSTD_OVERLAPLOG_MIN       0
-@@ -1303,7 +1369,7 @@ typedef enum {
+@@ -1146,7 +1238,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+ #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
+ 
+ /* Advanced parameter bounds */
+-#define ZSTD_TARGETCBLOCKSIZE_MIN   64
++#define ZSTD_TARGETCBLOCKSIZE_MIN   1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */
+ #define ZSTD_TARGETCBLOCKSIZE_MAX   ZSTD_BLOCKSIZE_MAX
+ #define ZSTD_SRCSIZEHINT_MIN        0
+ #define ZSTD_SRCSIZEHINT_MAX        INT_MAX
+@@ -1303,7 +1395,7 @@ typedef enum {
  } ZSTD_paramSwitch_e;
  
  /* *************************************
@@ -9726,7 +14744,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
  ***************************************/
  
  /*! ZSTD_findDecompressedSize() :
-@@ -1350,29 +1416,109 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size
+@@ -1350,29 +1442,122 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size
   *           or an error code (if srcSize is too small) */
  ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
  
@@ -9815,7 +14833,23 @@ index 79d55465d5c1..8b4ffe649df5 100644
 +
  /*! ZSTD_generateSequences() :
 - * Generate sequences using ZSTD_compress2, given a source buffer.
++ * WARNING: This function is meant for debugging and informational purposes ONLY!
++ * Its implementation is flawed, and it will be deleted in a future version.
++ * It is not guaranteed to succeed, as there are several cases where it will give
++ * up and fail. You should NOT use this function in production code.
++ *
++ * This function is deprecated, and will be removed in a future version.
++ *
 + * Generate sequences using ZSTD_compress2(), given a source buffer.
++ *
++ * @param zc The compression context to be used for ZSTD_compress2(). Set any
++ *           compression parameters you need on this context.
++ * @param outSeqs The output sequences buffer of size @p outSeqsSize
++ * @param outSeqsSize The size of the output sequences buffer.
++ *                    ZSTD_sequenceBound(srcSize) is an upper bound on the number
++ *                    of sequences that can be generated.
++ * @param src The source buffer, of size @p srcSize, to generate sequences from.
++ * @param srcSize The size of the source buffer.
   *
   * Each block will end with a dummy sequence
   * with offset == 0, matchLength == 0, and litLength == length of last literals.
@@ -9824,24 +14858,26 @@ index 79d55465d5c1..8b4ffe649df5 100644
   *
 - * zc can be used to insert custom compression params.
 - * This function invokes ZSTD_compress2
-+ * @zc can be used to insert custom compression params.
-+ * This function invokes ZSTD_compress2().
-  *
-  * The output of this function can be fed into ZSTD_compressSequences() with CCtx
-  * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
-  * @return : number of sequences generated
+- *
+- * The output of this function can be fed into ZSTD_compressSequences() with CCtx
+- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
+- * @return : number of sequences generated
++ * @returns The number of sequences generated, necessarily less than
++ *          ZSTD_sequenceBound(srcSize), or an error code that can be checked
++ *          with ZSTD_isError().
   */
- 
+-
 -ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
 -                                          size_t outSeqsSize, const void* src, size_t srcSize);
++ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()")
 +ZSTDLIB_STATIC_API size_t
-+ZSTD_generateSequences( ZSTD_CCtx* zc,
-+                        ZSTD_Sequence* outSeqs, size_t outSeqsSize,
-+                        const void* src, size_t srcSize);
++ZSTD_generateSequences(ZSTD_CCtx* zc,
++                       ZSTD_Sequence* outSeqs, size_t outSeqsSize,
++                       const void* src, size_t srcSize);
  
  /*! ZSTD_mergeBlockDelimiters() :
   * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
-@@ -1388,7 +1534,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o
+@@ -1388,7 +1573,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o
  ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize);
  
  /*! ZSTD_compressSequences() :
@@ -9852,7 +14888,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.)
   * The entire source is compressed into a single frame.
   *
-@@ -1413,11 +1561,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si
+@@ -1413,11 +1600,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si
   * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused.
   * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly,
   *         and cannot emit an RLE block that disagrees with the repcode history
@@ -9869,34 +14905,82 @@ index 79d55465d5c1..8b4ffe649df5 100644
  
  
  /*! ZSTD_writeSkippableFrame() :
-@@ -1481,8 +1630,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size);
+@@ -1464,48 +1652,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size);
+ /*! ZSTD_estimate*() :
+  *  These functions make it possible to estimate memory usage
+  *  of a future {D,C}Ctx, before its creation.
++ *  This is useful in combination with ZSTD_initStatic(),
++ *  which makes it possible to employ a static buffer for ZSTD_CCtx* state.
+  *
+  *  ZSTD_estimateCCtxSize() will provide a memory budget large enough
+- *  for any compression level up to selected one.
+- *  Note : Unlike ZSTD_estimateCStreamSize*(), this estimate
+- *         does not include space for a window buffer.
+- *         Therefore, the estimation is only guaranteed for single-shot compressions, not streaming.
++ *  to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2()
++ *  associated with any compression level up to max specified one.
+  *  The estimate will assume the input may be arbitrarily large,
+  *  which is the worst case.
+  *
++ *  Note that the size estimation is specific for one-shot compression,
++ *  it is not valid for streaming (see ZSTD_estimateCStreamSize*())
++ *  nor other potential ways of using a ZSTD_CCtx* state.
++ *
+  *  When srcSize can be bound by a known and rather "small" value,
+- *  this fact can be used to provide a tighter estimation
+- *  because the CCtx compression context will need less memory.
+- *  This tighter estimation can be provided by more advanced functions
++ *  this knowledge can be used to provide a tighter budget estimation
++ *  because the ZSTD_CCtx* state will need less memory for small inputs.
++ *  This tighter estimation can be provided by employing more advanced functions
+  *  ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(),
   *  and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
   *  Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
   *
 - *  Note 2 : only single-threaded compression is supported.
 + *  Note : only single-threaded compression is supported.
   *  ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
-+ *
-+ *  Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
-+ *  Size estimates assume that no external sequence producer is registered.
   */
- ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
++ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel);
  ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
-@@ -1501,7 +1653,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void);
+ ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void);
+ 
+ /*! ZSTD_estimateCStreamSize() :
+- *  ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
+- *  It will also consider src size to be arbitrarily "large", which is worst case.
++ *  ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression
++ *  using any compression level up to the max specified one.
++ *  It will also consider src size to be arbitrarily "large", which is a worst case scenario.
+  *  If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
+  *  ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+  *  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1.
+  *  Note : CStream size estimation is only correct for single-threaded compression.
+- *  ZSTD_DStream memory budget depends on window Size.
++ *  ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
++ *  Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
++ *  Size estimates assume that no external sequence producer is registered.
++ *
++ *  ZSTD_DStream memory budget depends on frame's window Size.
+  *  This information can be passed manually, using ZSTD_estimateDStreamSize,
   *  or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
++ *  Any frame requesting a window size larger than max specified one will be rejected.
   *  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
   *         an internal ?Dict will be created, which additional size is not estimated here.
 - *         In this case, get total size by adding ZSTD_estimate?DictSize */
+-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
 + *         In this case, get total size by adding ZSTD_estimate?DictSize
-+ *  Note 2 : only single-threaded compression is supported.
-+ *  ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
-+ *  Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
-+ *  Size estimates assume that no external sequence producer is registered.
 + */
- ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
++ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel);
  ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
  ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
-@@ -1649,22 +1806,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
+-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
++ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize);
+ ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
+ 
+ /*! ZSTD_estimate?DictSize() :
+@@ -1649,22 +1848,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
   *  This function never fails (wide contract) */
  ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
  
@@ -9946,7 +15030,19 @@ index 79d55465d5c1..8b4ffe649df5 100644
  size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
                                                void* dst, size_t dstCapacity,
                                          const void* src, size_t srcSize,
-@@ -1808,13 +1988,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
+@@ -1737,11 +1959,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
+  */
+ #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5
+ 
+-/* Tries to fit compressed block size to be around targetCBlockSize.
+- * No target when targetCBlockSize == 0.
+- * There is no guarantee on compressed block size (default:0) */
+-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
+-
+ /* User's best guess of source size.
+  * Hint is not valid when srcSizeHint == 0.
+  * There is no guarantee that hint is close to actual source size,
+@@ -1808,13 +2025,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
   * Experimental parameter.
   * Default is 0 == disabled. Set to 1 to enable.
   *
@@ -9970,7 +15066,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   *
   * When this flag is enabled zstd won't allocate an input window buffer,
   * because the user guarantees it can reference the ZSTD_inBuffer until
-@@ -1822,18 +2005,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
+@@ -1822,18 +2042,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
   * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also
   * avoid the memcpy() from the input buffer to the input window buffer.
   *
@@ -9994,7 +15090,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   */
  #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9
  
-@@ -1878,7 +2058,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
+@@ -1878,7 +2095,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
   * Without validation, providing a sequence that does not conform to the zstd spec will cause
   * undefined behavior, and may produce a corrupted block.
   *
@@ -10003,7 +15099,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   * specifics regarding offset/matchlength requirements) then the function will bail out and
   * return an error.
   *
-@@ -1928,6 +2108,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
+@@ -1928,6 +2145,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
   */
  #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15
  
@@ -10083,7 +15179,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
  /*! ZSTD_CCtx_getParameter() :
   *  Get the requested compression parameter value, selected by enum ZSTD_cParameter,
   *  and store it into int* value.
-@@ -2084,7 +2337,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
+@@ -2084,7 +2374,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
   * in the range [dst, dst + pos) MUST not be modified during decompression
   * or you will get data corruption.
   *
@@ -10092,7 +15188,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   * it can write directly to the ZSTD_outBuffer, but it will still allocate
   * an input buffer large enough to fit any compressed block. This will also
   * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
-@@ -2137,6 +2390,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
+@@ -2137,6 +2427,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
   */
  #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4
  
@@ -10106,11 +15202,27 @@ index 79d55465d5c1..8b4ffe649df5 100644
 + * ZSTD_DISABLE_ASM.
 + */
 +#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5
++
++/* ZSTD_d_maxBlockSize
++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB).
++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default.
++ *
++ * Forces the decompressor to reject blocks whose content size is
++ * larger than the configured maxBlockSize. When maxBlockSize is
++ * larger than the windowSize, the windowSize is used instead.
++ * This saves memory on the decoder when you know all blocks are small.
++ *
++ * This option is typically used in conjunction with ZSTD_c_maxBlockSize.
++ *
++ * WARNING: This causes the decoder to reject otherwise valid frames
++ * that have block sizes larger than the configured maxBlockSize.
++ */
++#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6
 +
  
  /*! ZSTD_DCtx_setFormat() :
   *  This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter().
-@@ -2145,6 +2409,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
+@@ -2145,6 +2462,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
   *  such ZSTD_f_zstd1_magicless for example.
   * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
  ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead")
@@ -10118,7 +15230,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
  size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
  
  /*! ZSTD_decompressStream_simpleArgs() :
-@@ -2181,6 +2446,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs (
+@@ -2181,6 +2499,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs (
   * This prototype will generate compilation warnings.
   */
  ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
@@ -10126,7 +15238,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
  size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
                           int compressionLevel,
                           unsigned long long pledgedSrcSize);
-@@ -2198,17 +2464,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
+@@ -2198,17 +2517,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
   * This prototype will generate compilation warnings.
   */
  ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
@@ -10147,7 +15259,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
   *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
   *
-@@ -2218,6 +2482,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
+@@ -2218,6 +2535,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
   * This prototype will generate compilation warnings.
   */
  ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
@@ -10155,7 +15267,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
  size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
                      const void* dict, size_t dictSize,
                            ZSTD_parameters params,
-@@ -2232,15 +2497,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+@@ -2232,15 +2550,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
   * This prototype will generate compilation warnings.
   */
  ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
@@ -10174,7 +15286,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
   *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
   *     ZSTD_CCtx_refCDict(zcs, cdict);
   *
-@@ -2250,6 +2513,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
+@@ -2250,6 +2566,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
   * This prototype will generate compilation warnings.
   */
  ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
@@ -10182,7 +15294,16 @@ index 79d55465d5c1..8b4ffe649df5 100644
  size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
                                 const ZSTD_CDict* cdict,
                                       ZSTD_frameParameters fParams,
-@@ -2274,6 +2538,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+@@ -2264,7 +2581,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+  *       explicitly specified.
+  *
+  *  start a new frame, using same parameters from previous frame.
+- *  This is typically useful to skip dictionary loading stage, since it will re-use it in-place.
++ *  This is typically useful to skip dictionary loading stage, since it will reuse it in-place.
+  *  Note that zcs must be init at least once before using ZSTD_resetCStream().
+  *  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
+  *  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
+@@ -2274,6 +2591,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
   *  This prototype will generate compilation warnings.
   */
  ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
@@ -10190,7 +15311,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
  size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
  
  
-@@ -2319,8 +2584,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
+@@ -2319,8 +2637,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
   *     ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
   *
   * note: no dictionary will be used if dict == NULL or dictSize < 8
@@ -10200,7 +15321,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
  ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);
  
  /*!
-@@ -2330,8 +2595,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo
+@@ -2330,8 +2648,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo
   *     ZSTD_DCtx_refDDict(zds, ddict);
   *
   * note : ddict is referenced, it must outlive decompression session
@@ -10210,11 +15331,13 @@ index 79d55465d5c1..8b4ffe649df5 100644
  ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);
  
  /*!
-@@ -2340,17 +2605,185 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z
+@@ -2339,18 +2657,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z
+  *
   *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
   *
-  * re-use decompression parameters from previous init; saves dictionary loading
+- * re-use decompression parameters from previous init; saves dictionary loading
 - * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
++ * reuse decompression parameters from previous init; saves dictionary loading
   */
 +ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions")
  ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
@@ -10346,7 +15469,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
 +
 +#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1))
 +
-+typedef size_t ZSTD_sequenceProducer_F (
++typedef size_t (*ZSTD_sequenceProducer_F) (
 +  void* sequenceProducerState,
 +  ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
 +  const void* src, size_t srcSize,
@@ -10378,7 +15501,23 @@ index 79d55465d5c1..8b4ffe649df5 100644
 +ZSTD_registerSequenceProducer(
 +  ZSTD_CCtx* cctx,
 +  void* sequenceProducerState,
-+  ZSTD_sequenceProducer_F* sequenceProducer
++  ZSTD_sequenceProducer_F sequenceProducer
++);
++
++/*! ZSTD_CCtxParams_registerSequenceProducer() :
++ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params.
++ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(),
++ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx().
++ *
++ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx()
++ * is required, then this function is for you. Otherwise, you probably don't need it.
++ *
++ * See tests/zstreamtest.c for example usage. */
++ZSTDLIB_STATIC_API void
++ZSTD_CCtxParams_registerSequenceProducer(
++  ZSTD_CCtx_params* params,
++  void* sequenceProducerState,
++  ZSTD_sequenceProducer_F sequenceProducer
 +);
 +
 +
@@ -10401,7 +15540,12 @@ index 79d55465d5c1..8b4ffe649df5 100644
  ********************************************************************* */
  
  /*
-@@ -2362,7 +2795,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+@@ -2358,11 +2860,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+ 
+   A ZSTD_CCtx object is required to track streaming operations.
+   Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
+-  ZSTD_CCtx object can be re-used multiple times within successive compression operations.
++  ZSTD_CCtx object can be reused multiple times within successive compression operations.
  
    Start by initializing a context.
    Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression.
@@ -10409,7 +15553,12 @@ index 79d55465d5c1..8b4ffe649df5 100644
  
    Then, consume your input using ZSTD_compressContinue().
    There are some important considerations to keep in mind when using this advanced function :
-@@ -2384,18 +2816,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+@@ -2380,36 +2881,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+   It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
+   Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
+ 
+-  `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
++  `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again.
  */
  
  /*=====   Buffer-less streaming compression functions  =====*/
@@ -10439,7 +15588,13 @@ index 79d55465d5c1..8b4ffe649df5 100644
  size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize);   /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
  /*
    Buffer-less streaming decompression (synchronous mode)
-@@ -2408,8 +2850,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
+ 
+   A ZSTD_DCtx object is required to track streaming operations.
+   Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+-  A ZSTD_DCtx object can be re-used multiple times.
++  A ZSTD_DCtx object can be reused multiple times.
+ 
+   First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
    Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
    Data fragment must be large enough to ensure successful decoding.
   `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
@@ -10450,7 +15605,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
             errorCode, which can be tested using ZSTD_isError().
  
    It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
-@@ -2428,7 +2870,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
+@@ -2428,7 +2939,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
  
    The most memory efficient way is to use a round buffer of sufficient size.
    Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
@@ -10459,7 +15614,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
    In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
    up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
    which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`.
-@@ -2448,7 +2890,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
+@@ -2448,7 +2959,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
    ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
    ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
  
@@ -10468,7 +15623,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
    It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
    It can also be an error code, which can be tested with ZSTD_isError().
  
-@@ -2471,27 +2913,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
+@@ -2471,27 +2982,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
  */
  
  /*=====   Buffer-less streaming decompression functions  =====*/
@@ -10496,7 +15651,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
  ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize);  /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
  
  ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
-@@ -2502,6 +2924,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+@@ -2502,6 +2993,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
  ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
  
  /* misc */
@@ -10504,7 +15659,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
  ZSTDLIB_STATIC_API void   ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
  typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
  ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
-@@ -2509,11 +2932,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
+@@ -2509,11 +3001,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
  
  
  
@@ -10531,7 +15686,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
      Block functions produce and decode raw zstd blocks, without frame metadata.
      Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes).
      But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes.
-@@ -2524,7 +2959,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
+@@ -2524,7 +3028,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
      - It is necessary to init context before starting
        + compression : any ZSTD_compressBegin*() variant, including with dictionary
        + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
@@ -10539,7 +15694,7 @@ index 79d55465d5c1..8b4ffe649df5 100644
      - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
        + If input is larger than a block size, it's necessary to split input data into multiple blocks
        + For inputs larger than a single block, consider using regular ZSTD_compress() instead.
-@@ -2541,11 +2975,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
+@@ -2541,11 +3044,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
  */
  
  /*=====   Raw zstd block functions  =====*/
@@ -10569,7 +15724,7 @@ index 20f08c644b71..464c410b2768 100644
  # This source code is licensed under both the BSD-style license (found in the
 diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h
 new file mode 100644
-index 000000000000..05adbbeccaa9
+index 000000000000..16c3d08e8d1a
 --- /dev/null
 +++ b/lib/zstd/common/allocations.h
 @@ -0,0 +1,56 @@
@@ -10590,7 +15745,7 @@ index 000000000000..05adbbeccaa9
 +#define ZSTD_DEPS_NEED_MALLOC
 +#include "zstd_deps.h"   /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
 +
-+#include "mem.h" /* MEM_STATIC */
++#include "compiler.h" /* MEM_STATIC */
 +#define ZSTD_STATIC_LINKING_ONLY
 +#include <linux/zstd.h> /* ZSTD_customMem */
 +
@@ -10785,7 +15940,7 @@ index 000000000000..aa3487ec4b6a
 +
 +#endif /* ZSTD_BITS_H */
 diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h
-index feef3a1b1d60..444dc4f85c64 100644
+index feef3a1b1d60..6a13f1f0f1e8 100644
 --- a/lib/zstd/common/bitstream.h
 +++ b/lib/zstd/common/bitstream.h
 @@ -1,7 +1,8 @@
@@ -10806,7 +15961,43 @@ index feef3a1b1d60..444dc4f85c64 100644
  
  
  /*=========================================
-@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
+@@ -79,19 +81,20 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
+ /*-********************************************
+ *  bitStream decoding API (read backward)
+ **********************************************/
++typedef size_t BitContainerType;
+ typedef struct {
+-    size_t   bitContainer;
++    BitContainerType bitContainer;
+     unsigned bitsConsumed;
+     const char* ptr;
+     const char* start;
+     const char* limitPtr;
+ } BIT_DStream_t;
+ 
+-typedef enum { BIT_DStream_unfinished = 0,
+-               BIT_DStream_endOfBuffer = 1,
+-               BIT_DStream_completed = 2,
+-               BIT_DStream_overflow = 3 } BIT_DStream_status;  /* result of BIT_reloadDStream() */
+-               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
++typedef enum { BIT_DStream_unfinished = 0,  /* fully refilled */
++               BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */
++               BIT_DStream_completed = 2,   /* bitstream entirely consumed, bit-exact */
++               BIT_DStream_overflow = 3     /* user requested more bits than present in bitstream */
++    } BIT_DStream_status;  /* result of BIT_reloadDStream() */
+ 
+ MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+ MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+@@ -101,7 +104,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+ 
+ /* Start by invoking BIT_initDStream().
+ *  A chunk of the bitStream is then stored into a local register.
+-*  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
++*  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType).
+ *  You can then retrieve bitFields stored into the local register, **in reverse order**.
+ *  Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
+ *  A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
+@@ -122,33 +125,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
  MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
  /* faster, but works only if nbBits >= 1 */
  
@@ -10840,11 +16031,11 @@ index feef3a1b1d60..444dc4f85c64 100644
  /*=====    Local Constants   =====*/
  static const unsigned BIT_mask[] = {
      0,          1,         3,         7,         0xF,       0x1F,
-@@ -178,6 +153,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
+@@ -178,6 +154,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
      return 0;
  }
  
-+MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
++FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
 +{
 +    assert(nbBits < BIT_MASK_SIZE);
 +    return bitContainer & BIT_mask[nbBits];
@@ -10853,7 +16044,7 @@ index feef3a1b1d60..444dc4f85c64 100644
  /*! BIT_addBits() :
   *  can add up to 31 bits into `bitC`.
   *  Note : does not check for register overflow ! */
-@@ -187,7 +168,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
+@@ -187,7 +169,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
      DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32);
      assert(nbBits < BIT_MASK_SIZE);
      assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
@@ -10862,7 +16053,7 @@ index feef3a1b1d60..444dc4f85c64 100644
      bitC->bitPos += nbBits;
  }
  
-@@ -266,7 +247,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
+@@ -266,35 +248,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
          bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
          bitD->bitContainer = MEM_readLEST(bitD->ptr);
          { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
@@ -10871,7 +16062,33 @@ index feef3a1b1d60..444dc4f85c64 100644
            if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
      } else {
          bitD->ptr   = bitD->start;
-@@ -294,7 +275,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
+         bitD->bitContainer = *(const BYTE*)(bitD->start);
+         switch(srcSize)
+         {
+-        case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
++        case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
+                 ZSTD_FALLTHROUGH;
+ 
+-        case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
++        case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
+                 ZSTD_FALLTHROUGH;
+ 
+-        case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
++        case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
+                 ZSTD_FALLTHROUGH;
+ 
+-        case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24;
++        case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24;
+                 ZSTD_FALLTHROUGH;
+ 
+-        case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16;
++        case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16;
+                 ZSTD_FALLTHROUGH;
+ 
+-        case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) <<  8;
++        case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) <<  8;
+                 ZSTD_FALLTHROUGH;
+ 
          default: break;
          }
          {   BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
@@ -10880,7 +16097,22 @@ index feef3a1b1d60..444dc4f85c64 100644
              if (lastByte == 0) return ERROR(corruption_detected);  /* endMark not present */
          }
          bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
-@@ -325,12 +306,6 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c
+@@ -303,12 +285,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
+     return srcSize;
+ }
+ 
+-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start)
++FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start)
+ {
+     return bitContainer >> start;
+ }
+ 
+-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)
++FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits)
+ {
+     U32 const regMask = sizeof(bitContainer)*8 - 1;
+     /* if start > regMask, bitstream is corrupted, and result is undefined */
+@@ -325,19 +307,13 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c
  #endif
  }
  
@@ -10893,7 +16125,33 @@ index feef3a1b1d60..444dc4f85c64 100644
  /*! BIT_lookBits() :
   *  Provides next n bits from local register.
   *  local register is not modified.
-@@ -377,7 +352,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n
+  *  On 32-bits, maxNbBits==24.
+  *  On 64-bits, maxNbBits==56.
+  * @return : value extracted */
+-MEM_STATIC  FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t*  bitD, U32 nbBits)
++FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t*  bitD, U32 nbBits)
+ {
+     /* arbitrate between double-shift and shift+mask */
+ #if 1
+@@ -360,7 +336,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
+     return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);
+ }
+ 
+-MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
++FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+ {
+     bitD->bitsConsumed += nbBits;
+ }
+@@ -369,7 +345,7 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+  *  Read (consume) next n bits from local register and update.
+  *  Pay attention to not read more than nbBits contained into local register.
+  * @return : extracted value. */
+-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits)
++FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits)
+ {
+     size_t const value = BIT_lookBits(bitD, nbBits);
+     BIT_skipBits(bitD, nbBits);
+@@ -377,7 +353,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n
  }
  
  /*! BIT_readBitsFast() :
@@ -10902,17 +16160,76 @@ index feef3a1b1d60..444dc4f85c64 100644
  MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
  {
      size_t const value = BIT_lookBitsFast(bitD, nbBits);
-@@ -408,7 +383,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
-  *  This function is safe, it guarantees it will not read beyond src buffer.
+@@ -386,6 +362,21 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
+     return value;
+ }
+ 
++/*! BIT_reloadDStream_internal() :
++ *  Simple variant of BIT_reloadDStream(), with two conditions:
++ *  1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8
++ *  2. look window is valid after shifted down : bitD->ptr >= bitD->start
++ */
++MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD)
++{
++    assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8);
++    bitD->ptr -= bitD->bitsConsumed >> 3;
++    assert(bitD->ptr >= bitD->start);
++    bitD->bitsConsumed &= 7;
++    bitD->bitContainer = MEM_readLEST(bitD->ptr);
++    return BIT_DStream_unfinished;
++}
++
+ /*! BIT_reloadDStreamFast() :
+  *  Similar to BIT_reloadDStream(), but with two differences:
+  *  1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold!
+@@ -396,31 +387,35 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
+ {
+     if (UNLIKELY(bitD->ptr < bitD->limitPtr))
+         return BIT_DStream_overflow;
+-    assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8);
+-    bitD->ptr -= bitD->bitsConsumed >> 3;
+-    bitD->bitsConsumed &= 7;
+-    bitD->bitContainer = MEM_readLEST(bitD->ptr);
+-    return BIT_DStream_unfinished;
++    return BIT_reloadDStream_internal(bitD);
+ }
+ 
+ /*! BIT_reloadDStream() :
+  *  Refill `bitD` from buffer previously set in BIT_initDStream() .
+- *  This function is safe, it guarantees it will not read beyond src buffer.
++ *  This function is safe, it guarantees it will never read beyond src buffer.
   * @return : status of `BIT_DStream_t` internal register.
   *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
 -MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
-+MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
++FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
  {
-     if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* overflow detected, like end of stream */
+-    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* overflow detected, like end of stream */
++    /* note : once in overflow mode, a bitstream remains in this mode until it's reset */
++    if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) {
++        static const BitContainerType zeroFilled = 0;
++        bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */
++        /* overflow detected, erroneous scenario or end of stream: no update */
          return BIT_DStream_overflow;
++    }
++
++    assert(bitD->ptr >= bitD->start);
+ 
+     if (bitD->ptr >= bitD->limitPtr) {
+-        return BIT_reloadDStreamFast(bitD);
++        return BIT_reloadDStream_internal(bitD);
+     }
+     if (bitD->ptr == bitD->start) {
++        /* reached end of bitStream => no update */
+         if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+         return BIT_DStream_completed;
+     }
+-    /* start < ptr < limitPtr */
++    /* start < ptr < limitPtr => cautious update */
+     {   U32 nbBytes = bitD->bitsConsumed >> 3;
+         BIT_DStream_status result = BIT_DStream_unfinished;
+         if (bitD->ptr - nbBytes < bitD->start) {
 diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h
-index c42d39faf9bd..c437e0975575 100644
+index c42d39faf9bd..508ee25537bb 100644
 --- a/lib/zstd/common/compiler.h
 +++ b/lib/zstd/common/compiler.h
 @@ -1,5 +1,6 @@
@@ -10923,10 +16240,178 @@ index c42d39faf9bd..c437e0975575 100644
   * All rights reserved.
   *
   * This source code is licensed under both the BSD-style license (found in the
-@@ -179,6 +180,17 @@
+@@ -11,6 +12,8 @@
+ #ifndef ZSTD_COMPILER_H
+ #define ZSTD_COMPILER_H
+ 
++#include <linux/types.h>
++
+ #include "portability_macros.h"
+ 
+ /*-*******************************************************
+@@ -41,12 +44,15 @@
+ */
+ #define WIN_CDECL
+ 
++/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
++#define UNUSED_ATTR __attribute__((unused))
++
+ /*
+  * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
+  * parameters. They must be inlined for the compiler to eliminate the constant
+  * branches.
+  */
+-#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
++#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR
+ /*
+  * HINT_INLINE is used to help the compiler generate better code. It is *not*
+  * used for "templates", so it can be tweaked based on the compilers
+@@ -61,11 +67,21 @@
+ #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
+ #  define HINT_INLINE static INLINE_KEYWORD
+ #else
+-#  define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
++#  define HINT_INLINE FORCE_INLINE_TEMPLATE
+ #endif
+ 
+-/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
+-#define UNUSED_ATTR __attribute__((unused))
++/* "soft" inline :
++ * The compiler is free to select if it's a good idea to inline or not.
++ * The main objective is to silence compiler warnings
++ * when a defined function is included but not used.
++ *
++ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit.
++ * Updating the prefix is probably preferable, but requires a fairly large codemod,
++ * since this name is used everywhere.
++ */
++#ifndef MEM_STATIC  /* already defined in Linux Kernel mem.h */
++#define MEM_STATIC static __inline UNUSED_ATTR
++#endif
+ 
+ /* force no inlining */
+ #define FORCE_NOINLINE static __attribute__((__noinline__))
+@@ -86,23 +102,24 @@
+ #  define PREFETCH_L1(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+ #  define PREFETCH_L2(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
+ #elif defined(__aarch64__)
+-#  define PREFETCH_L1(ptr)  __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr)))
+-#  define PREFETCH_L2(ptr)  __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr)))
++#  define PREFETCH_L1(ptr)  do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0)
++#  define PREFETCH_L2(ptr)  do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0)
+ #else
+-#  define PREFETCH_L1(ptr) (void)(ptr)  /* disabled */
+-#  define PREFETCH_L2(ptr) (void)(ptr)  /* disabled */
++#  define PREFETCH_L1(ptr) do { (void)(ptr); } while (0)  /* disabled */
++#  define PREFETCH_L2(ptr) do { (void)(ptr); } while (0)  /* disabled */
+ #endif  /* NO_PREFETCH */
+ 
+ #define CACHELINE_SIZE 64
+ 
+-#define PREFETCH_AREA(p, s)  {            \
+-    const char* const _ptr = (const char*)(p);  \
+-    size_t const _size = (size_t)(s);     \
+-    size_t _pos;                          \
+-    for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) {  \
+-        PREFETCH_L2(_ptr + _pos);         \
+-    }                                     \
+-}
++#define PREFETCH_AREA(p, s)                              \
++    do {                                                 \
++        const char* const _ptr = (const char*)(p);       \
++        size_t const _size = (size_t)(s);                \
++        size_t _pos;                                     \
++        for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \
++            PREFETCH_L2(_ptr + _pos);                    \
++        }                                                \
++    } while (0)
+ 
+ /* vectorization
+  * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax,
+@@ -126,9 +143,9 @@
+ #define UNLIKELY(x) (__builtin_expect((x), 0))
+ 
+ #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)))
+-#  define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); }
++#  define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0)
+ #else
+-#  define ZSTD_UNREACHABLE { assert(0); }
++#  define ZSTD_UNREACHABLE do { assert(0); } while (0)
+ #endif
+ 
+ /* disable warnings */
+@@ -179,6 +196,85 @@
  *  Sanitizer
  *****************************************************************/
  
++/*
++ * Zstd relies on pointer overflow in its decompressor.
++ * We add this attribute to functions that rely on pointer overflow.
++ */
++#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++#  if __has_attribute(no_sanitize)
++#    if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8
++       /* gcc < 8 only has signed-integer-overflow which triggers on pointer overflow */
++#      define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow")))
++#    else
++       /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */
++#      define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow")))
++#    endif
++#  else
++#    define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++#  endif
++#endif
++
++/*
++ * Helper function to perform a wrapped pointer difference without triggering
++ * UBSAN.
++ *
++ * @returns lhs - rhs with wrapping
++ */
++MEM_STATIC
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs)
++{
++    return lhs - rhs;
++}
++
++/*
++ * Helper function to perform a wrapped pointer add without triggering UBSAN.
++ *
++ * @return ptr + add with wrapping
++ */
++MEM_STATIC
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add)
++{
++    return ptr + add;
++}
++
++/*
++ * Helper function to perform a wrapped pointer subtraction without triggering
++ * UBSAN.
++ *
++ * @return ptr - sub with wrapping
++ */
++MEM_STATIC
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub)
++{
++    return ptr - sub;
++}
++
++/*
++ * Helper function to add to a pointer that works around C's undefined behavior
++ * of adding 0 to NULL.
++ *
++ * @returns `ptr + add` when `add > 0`, otherwise `ptr` unchanged (so `NULL + 0 == NULL`).
++ */
++MEM_STATIC
++unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add)
++{
++    return add > 0 ? ptr + add : ptr;
++}
++
 +/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an
 + * abundance of caution, disable our custom poisoning on mingw. */
 +#ifdef __MINGW32__
@@ -10954,7 +16439,7 @@ index 0db7b42407ee..d8319a2bef4c 100644
   *
   * This source code is licensed under both the BSD-style license (found in the
 diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c
-index bb863c9ea616..d77926cbad14 100644
+index bb863c9ea616..8eb6aa9a3b20 100644
 --- a/lib/zstd/common/debug.c
 +++ b/lib/zstd/common/debug.c
 @@ -1,7 +1,8 @@
@@ -10967,15 +16452,19 @@ index bb863c9ea616..d77926cbad14 100644
   *
   * You can contact the author at :
   * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
-@@ -21,4 +22,6 @@
+@@ -21,4 +22,10 @@
  
  #include "debug.h"
  
 +#if (DEBUGLEVEL>=2)
++/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a
++ * translation unit is empty. So remove this from Linux kernel builds, but
++ * otherwise just leave it in.
++ */
  int g_debuglevel = DEBUGLEVEL;
 +#endif
 diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h
-index 6dd88d1fbd02..da0dbfc614b8 100644
+index 6dd88d1fbd02..226ba3c57ec3 100644
 --- a/lib/zstd/common/debug.h
 +++ b/lib/zstd/common/debug.h
 @@ -1,7 +1,8 @@
@@ -10988,6 +16477,45 @@ index 6dd88d1fbd02..da0dbfc614b8 100644
   *
   * You can contact the author at :
   * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+@@ -82,18 +83,27 @@ extern int g_debuglevel; /* the variable is only declared,
+                             It's useful when enabling very verbose levels
+                             on selective conditions (such as position in src) */
+ 
+-#  define RAWLOG(l, ...) {                                       \
+-                if (l<=g_debuglevel) {                           \
+-                    ZSTD_DEBUG_PRINT(__VA_ARGS__);               \
+-            }   }
+-#  define DEBUGLOG(l, ...) {                                     \
+-                if (l<=g_debuglevel) {                           \
+-                    ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \
+-                    ZSTD_DEBUG_PRINT(" \n");                     \
+-            }   }
++#  define RAWLOG(l, ...)                   \
++    do {                                   \
++        if (l<=g_debuglevel) {             \
++            ZSTD_DEBUG_PRINT(__VA_ARGS__); \
++        }                                  \
++    } while (0)
++
++#define STRINGIFY(x) #x
++#define TOSTRING(x) STRINGIFY(x)
++#define LINE_AS_STRING TOSTRING(__LINE__)
++
++#  define DEBUGLOG(l, ...)                               \
++    do {                                                 \
++        if (l<=g_debuglevel) {                           \
++            ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \
++            ZSTD_DEBUG_PRINT(" \n");                     \
++        }                                                \
++    } while (0)
+ #else
+-#  define RAWLOG(l, ...)      {}    /* disabled */
+-#  define DEBUGLOG(l, ...)    {}    /* disabled */
++#  define RAWLOG(l, ...)   do { } while (0)    /* disabled */
++#  define DEBUGLOG(l, ...) do { } while (0)    /* disabled */
+ #endif
+ 
+ 
 diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c
 index fef67056f052..6cdd82233fb5 100644
 --- a/lib/zstd/common/entropy_common.c
@@ -11155,7 +16683,7 @@ index 6d1135f8c373..a4062d30d170 100644
      default: return notErrorCode;
      }
 diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h
-index ca5101e542fa..9a4699a38a88 100644
+index ca5101e542fa..0410ca415b54 100644
 --- a/lib/zstd/common/error_private.h
 +++ b/lib/zstd/common/error_private.h
 @@ -1,5 +1,6 @@
@@ -11166,8 +16694,122 @@ index ca5101e542fa..9a4699a38a88 100644
   * All rights reserved.
   *
   * This source code is licensed under both the BSD-style license (found in the
+@@ -49,8 +50,13 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+ ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); }
+ 
+ /* check and forward error code */
+-#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e
+-#define CHECK_F(f)   { CHECK_V_F(_var_err__, f); }
++#define CHECK_V_F(e, f)     \
++    size_t const e = f;     \
++    do {                    \
++        if (ERR_isError(e)) \
++            return e;       \
++    } while (0)
++#define CHECK_F(f)   do { CHECK_V_F(_var_err__, f); } while (0)
+ 
+ 
+ /*-****************************************
+@@ -84,10 +90,12 @@ void _force_has_format_string(const char *format, ...) {
+  * We want to force this function invocation to be syntactically correct, but
+  * we don't want to force runtime evaluation of its arguments.
+  */
+-#define _FORCE_HAS_FORMAT_STRING(...) \
+-  if (0) { \
+-    _force_has_format_string(__VA_ARGS__); \
+-  }
++#define _FORCE_HAS_FORMAT_STRING(...)              \
++    do {                                           \
++        if (0) {                                   \
++            _force_has_format_string(__VA_ARGS__); \
++        }                                          \
++    } while (0)
+ 
+ #define ERR_QUOTE(str) #str
+ 
+@@ -98,48 +106,50 @@ void _force_has_format_string(const char *format, ...) {
+  * In order to do that (particularly, printing the conditional that failed),
+  * this can't just wrap RETURN_ERROR().
+  */
+-#define RETURN_ERROR_IF(cond, err, ...) \
+-  if (cond) { \
+-    RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \
+-           __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \
+-    _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+-    RAWLOG(3, ": " __VA_ARGS__); \
+-    RAWLOG(3, "\n"); \
+-    return ERROR(err); \
+-  }
++#define RETURN_ERROR_IF(cond, err, ...)                                        \
++    do {                                                                       \
++        if (cond) {                                                            \
++            RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s",          \
++                  __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \
++            _FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                             \
++            RAWLOG(3, ": " __VA_ARGS__);                                       \
++            RAWLOG(3, "\n");                                                   \
++            return ERROR(err);                                                 \
++        }                                                                      \
++    } while (0)
+ 
+ /*
+  * Unconditionally return the specified error.
+  *
+  * In debug modes, prints additional information.
+  */
+-#define RETURN_ERROR(err, ...) \
+-  do { \
+-    RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
+-           __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \
+-    _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+-    RAWLOG(3, ": " __VA_ARGS__); \
+-    RAWLOG(3, "\n"); \
+-    return ERROR(err); \
+-  } while(0);
++#define RETURN_ERROR(err, ...)                                               \
++    do {                                                                     \
++        RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
++              __FILE__, __LINE__, ERR_QUOTE(ERROR(err)));                    \
++        _FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                               \
++        RAWLOG(3, ": " __VA_ARGS__);                                         \
++        RAWLOG(3, "\n");                                                     \
++        return ERROR(err);                                                   \
++    } while(0)
+ 
+ /*
+  * If the provided expression evaluates to an error code, returns that error code.
+  *
+  * In debug modes, prints additional information.
+  */
+-#define FORWARD_IF_ERROR(err, ...) \
+-  do { \
+-    size_t const err_code = (err); \
+-    if (ERR_isError(err_code)) { \
+-      RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \
+-             __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \
+-      _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+-      RAWLOG(3, ": " __VA_ARGS__); \
+-      RAWLOG(3, "\n"); \
+-      return err_code; \
+-    } \
+-  } while(0);
++#define FORWARD_IF_ERROR(err, ...)                                                 \
++    do {                                                                           \
++        size_t const err_code = (err);                                             \
++        if (ERR_isError(err_code)) {                                               \
++            RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s",                 \
++                  __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \
++            _FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                                 \
++            RAWLOG(3, ": " __VA_ARGS__);                                           \
++            RAWLOG(3, "\n");                                                       \
++            return err_code;                                                       \
++        }                                                                          \
++    } while(0)
+ 
+ 
+ #endif /* ERROR_H_MODULE */
 diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h
-index 4507043b2287..c4e25a219142 100644
+index 4507043b2287..2185a578617d 100644
 --- a/lib/zstd/common/fse.h
 +++ b/lib/zstd/common/fse.h
 @@ -1,7 +1,8 @@
@@ -11269,7 +16911,15 @@ index 4507043b2287..c4e25a219142 100644
  
  /*!
  Tutorial :
-@@ -317,16 +258,6 @@ If there is an error, the function will return an error code, which can be teste
+@@ -286,6 +227,7 @@ If there is an error, the function will return an error code, which can be teste
+ 
+ #endif  /* FSE_H */
+ 
++
+ #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY)
+ #define FSE_H_FSE_STATIC_LINKING_ONLY
+ 
+@@ -317,16 +259,6 @@ If there is an error, the function will return an error code, which can be teste
  unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
  /*< same as FSE_optimalTableLog(), which used `minus==2` */
  
@@ -11286,7 +16936,7 @@ index 4507043b2287..c4e25a219142 100644
  size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
  /*< build a fake FSE_CTable, designed to compress always the same symbolValue */
  
-@@ -344,19 +275,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
+@@ -344,19 +276,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
  FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
  /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */
  
@@ -11309,7 +16959,22 @@ index 4507043b2287..c4e25a219142 100644
  
  typedef enum {
     FSE_repeat_none,  /*< Cannot use the previous table */
-@@ -552,7 +475,7 @@ MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePt
+@@ -539,20 +463,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un
+     FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+     const U16* const stateTable = (const U16*)(statePtr->stateTable);
+     U32 const nbBitsOut  = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+-    BIT_addBits(bitC, statePtr->value, nbBitsOut);
++    BIT_addBits(bitC,  (size_t)statePtr->value, nbBitsOut);
+     statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+ }
+ 
+ MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
+ {
+-    BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
++    BIT_addBits(bitC, (size_t)statePtr->value, statePtr->stateLog);
+     BIT_flushBits(bitC);
+ }
+ 
  
  /* FSE_getMaxNbBits() :
   * Approximate maximum cost of a symbol, in bits.
@@ -11319,7 +16984,7 @@ index 4507043b2287..c4e25a219142 100644
   * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
  MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue)
 diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c
-index 8dcb8ca39767..99ce8fa54d08 100644
+index 8dcb8ca39767..3a17e84f27bf 100644
 --- a/lib/zstd/common/fse_decompress.c
 +++ b/lib/zstd/common/fse_decompress.c
 @@ -1,6 +1,7 @@
@@ -11331,15 +16996,18 @@ index 8dcb8ca39767..99ce8fa54d08 100644
   *
   *  You can contact the author at :
   *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
-@@ -24,6 +25,7 @@
+@@ -22,8 +23,8 @@
+ #define FSE_STATIC_LINKING_ONLY
+ #include "fse.h"
  #include "error_private.h"
- #define ZSTD_DEPS_NEED_MALLOC
- #include "zstd_deps.h"
+-#define ZSTD_DEPS_NEED_MALLOC
+-#include "zstd_deps.h"
++#include "zstd_deps.h"  /* ZSTD_memcpy */
 +#include "bits.h"       /* ZSTD_highbit32 */
  
  
  /* **************************************************************
-@@ -55,19 +57,6 @@
+@@ -55,19 +56,6 @@
  #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
  #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
  
@@ -11359,9 +17027,34 @@ index 8dcb8ca39767..99ce8fa54d08 100644
  static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
  {
      void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
-@@ -127,10 +116,10 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
-             }
-         }
+@@ -96,7 +84,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
+                     symbolNext[s] = 1;
+                 } else {
+                     if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+-                    symbolNext[s] = normalizedCounter[s];
++                    symbolNext[s] = (U16)normalizedCounter[s];
+         }   }   }
+         ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
+     }
+@@ -111,8 +99,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
+          * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
+          * our buffer to handle the over-write.
+          */
+-        {
+-            U64 const add = 0x0101010101010101ull;
++        {   U64 const add = 0x0101010101010101ull;
+             size_t pos = 0;
+             U64 sv = 0;
+             U32 s;
+@@ -123,14 +110,13 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
+                 for (i = 8; i < n; i += 8) {
+                     MEM_write64(spread + pos + i, sv);
+                 }
+-                pos += n;
+-            }
+-        }
++                pos += (size_t)n;
++        }   }
          /* Now we spread those positions across the table.
 -         * The benefit of doing it in two stages is that we avoid the the
 +         * The benefit of doing it in two stages is that we avoid the
@@ -11372,7 +17065,7 @@ index 8dcb8ca39767..99ce8fa54d08 100644
           */
          {
              size_t position = 0;
-@@ -166,7 +155,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
+@@ -166,7 +152,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
          for (u=0; u<tableSize; u++) {
              FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
              U32 const nextState = symbolNext[symbol]++;
@@ -11381,7 +17074,7 @@ index 8dcb8ca39767..99ce8fa54d08 100644
              tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
      }   }
  
-@@ -184,49 +173,6 @@ size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsi
+@@ -184,49 +170,6 @@ size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsi
  /*-*******************************************************
  *  Decompression (Byte symbols)
  *********************************************************/
@@ -11431,10 +17124,13 @@ index 8dcb8ca39767..99ce8fa54d08 100644
  
  FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
            void* dst, size_t maxDstSize,
-@@ -290,26 +236,6 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
-     return op-ostart;
- }
+@@ -287,32 +230,12 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
+             break;
+     }   }
  
+-    return op-ostart;
+-}
+-
 -
 -size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
 -                            const void* cSrc, size_t cSrcSize,
@@ -11453,12 +17149,39 @@ index 8dcb8ca39767..99ce8fa54d08 100644
 -size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
 -{
 -    return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0);
--}
--
++    assert(op >= ostart);
++    return (size_t)(op-ostart);
+ }
+ 
  typedef struct {
      short ncount[FSE_MAX_SYMBOL_VALUE + 1];
-     FSE_DTable dtable[]; /* Dynamically sized */
-@@ -342,7 +268,8 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
+-    FSE_DTable dtable[]; /* Dynamically sized */
+ } FSE_DecompressWksp;
+ 
+ 
+@@ -327,13 +250,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
+     unsigned tableLog;
+     unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+     FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace;
++    size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable);
++    FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos;
+ 
+-    DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0);
++    FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0);
+     if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC);
+ 
++    /* correct offset to dtable depends on this property */
++    FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0);
++
+     /* normal FSE decoding mode */
+-    {
+-        size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2);
++    {   size_t const NCountLength =
++            FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2);
+         if (FSE_isError(NCountLength)) return NCountLength;
+         if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
+         assert(NCountLength <= cSrcSize);
+@@ -342,19 +270,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
      }
  
      if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge);
@@ -11467,8 +17190,24 @@ index 8dcb8ca39767..99ce8fa54d08 100644
 +    workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
      wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
  
-     CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );
-@@ -382,9 +309,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc,
+-    CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );
++    CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );
+ 
+     {
+-        const void* ptr = wksp->dtable;
++        const void* ptr = dtable;
+         const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
+         const U32 fastMode = DTableH->fastMode;
+ 
+         /* select fast mode (static) */
+-        if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1);
+-        return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0);
++        if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1);
++        return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0);
+     }
+ }
+ 
+@@ -382,9 +311,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc,
      return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
  }
  
@@ -11479,7 +17218,7 @@ index 8dcb8ca39767..99ce8fa54d08 100644
 -
  #endif   /* FSE_COMMONDEFS_ONLY */
 diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h
-index 5042ff870308..8e7943092ed1 100644
+index 5042ff870308..57462466e188 100644
 --- a/lib/zstd/common/huf.h
 +++ b/lib/zstd/common/huf.h
 @@ -1,7 +1,8 @@
@@ -11555,12 +17294,12 @@ index 5042ff870308..8e7943092ed1 100644
  /* Error Management */
 -HUF_PUBLIC_API unsigned    HUF_isError(size_t code);       /*< tells if a return value is an error code */
 -HUF_PUBLIC_API const char* HUF_getErrorName(size_t code);  /*< provides error code string (useful for debugging) */
+-
 +unsigned    HUF_isError(size_t code);       /*< tells if a return value is an error code */
 +const char* HUF_getErrorName(size_t code);  /*< provides error code string (useful for debugging) */
  
- 
 -/* ***   Advanced function   *** */
--
+ 
 -/* HUF_compress2() :
 - *  Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`.
 - * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX .
@@ -11713,7 +17452,31 @@ index 5042ff870308..8e7943092ed1 100644
  
  /* HUF_readCTable() :
   *  Loading a CTable saved with HUF_writeCTable() */
-@@ -276,32 +225,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
+@@ -246,9 +195,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
+ 
+ /* HUF_getNbBitsFromCTable() :
+  *  Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX
+- *  Note 1 : is not inlined, as HUF_CElt definition is private */
++ *  Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0
++ *  Note 2 : is not inlined, as HUF_CElt definition is private
++ */
+ U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue);
+ 
++typedef struct {
++    BYTE tableLog;
++    BYTE maxSymbolValue;
++    BYTE unused[sizeof(size_t) - 2];
++} HUF_CTableHeader;
++
++/* HUF_readCTableHeader() :
++ *  @returns The header from the CTable specifying the tableLog and the maxSymbolValue.
++ */
++HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable);
++
+ /*
+  * HUF_decompress() does the following:
+  * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics
+@@ -276,32 +238,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
  #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9))
  #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
  
@@ -11747,12 +17510,13 @@ index 5042ff870308..8e7943092ed1 100644
  /* HUF_compress1X_repeat() :
   *  Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
   *  If it uses hufTable it does not modify hufTable or repeat.
-@@ -312,47 +241,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
+@@ -312,47 +254,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
                         const void* src, size_t srcSize,
                         unsigned maxSymbolValue, unsigned tableLog,
                         void* workSpace, size_t wkspSize,   /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
 -                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
--
++                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
+ 
 -size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
 -#ifndef HUF_FORCE_DECOMPRESS_X1
 -size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
@@ -11764,18 +17528,17 @@ index 5042ff870308..8e7943092ed1 100644
 -size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< single-symbol decoder */
 -size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< single-symbol decoder */
 -#endif
--#ifndef HUF_FORCE_DECOMPRESS_X1
++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
+ #ifndef HUF_FORCE_DECOMPRESS_X1
 -size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< double-symbols decoder */
 -size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< double-symbols decoder */
 -#endif
-+                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
- 
+-
 -size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /*< automatic selection of sing or double symbol decoder, based on DTable */
 -#ifndef HUF_FORCE_DECOMPRESS_X2
 -size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
 -#endif
-+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
- #ifndef HUF_FORCE_DECOMPRESS_X1
+-#ifndef HUF_FORCE_DECOMPRESS_X1
 -size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
 +size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);   /*< double-symbols decoder */
  #endif
@@ -11806,7 +17569,7 @@ index 5042ff870308..8e7943092ed1 100644
 +#endif   /* HUF_H_298734234 */
  
 diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h
-index 1d9cc03924ca..a7231822b6e3 100644
+index 1d9cc03924ca..2e91e7780c1f 100644
 --- a/lib/zstd/common/mem.h
 +++ b/lib/zstd/common/mem.h
 @@ -1,6 +1,6 @@
@@ -11817,8 +17580,16 @@ index 1d9cc03924ca..a7231822b6e3 100644
   * All rights reserved.
   *
   * This source code is licensed under both the BSD-style license (found in the
+@@ -24,6 +24,7 @@
+ /*-****************************************
+ *  Compiler specifics
+ ******************************************/
++#undef MEM_STATIC /* may be already defined from common/compiler.h */
+ #define MEM_STATIC static inline
+ 
+ /*-**************************************************************
 diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h
-index 0e3b2c0a527d..7ede8cf1ffe5 100644
+index 0e3b2c0a527d..f08638cced6c 100644
 --- a/lib/zstd/common/portability_macros.h
 +++ b/lib/zstd/common/portability_macros.h
 @@ -1,5 +1,6 @@
@@ -11838,7 +17609,16 @@ index 0e3b2c0a527d..7ede8cf1ffe5 100644
   * This header is shared between C and ASM code, so it MUST only
   * contain macro definitions. It MUST not contain any C code.
   *
-@@ -65,7 +66,7 @@
+@@ -45,6 +46,8 @@
+ /* Mark the internal assembly functions as hidden  */
+ #ifdef __ELF__
+ # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func
++#elif defined(__APPLE__)
++# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func
+ #else
+ # define ZSTD_HIDE_ASM_FUNCTION(func)
+ #endif
+@@ -65,7 +68,7 @@
  #endif
  
  /*
@@ -11847,7 +17627,7 @@ index 0e3b2c0a527d..7ede8cf1ffe5 100644
   * because other platforms may not support GAS assembly syntax.
   *
   * Only enable assembly for Linux / MacOS, other platforms may
-@@ -90,4 +91,23 @@
+@@ -90,4 +93,23 @@
   */
  #define ZSTD_ENABLE_ASM_X86_64_BMI2 0
  
@@ -11960,7 +17740,7 @@ index 2c34e8a33a1c..f931f7d0e294 100644
 +#endif /* ZSTD_DEPS_STDINT */
 +#endif /* ZSTD_DEPS_NEED_STDINT */
 diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h
-index 93305d9b41bb..7f023e4d4774 100644
+index 93305d9b41bb..11da1233e890 100644
 --- a/lib/zstd/common/zstd_internal.h
 +++ b/lib/zstd/common/zstd_internal.h
 @@ -1,5 +1,6 @@
@@ -12008,6 +17788,33 @@ index 93305d9b41bb..7f023e4d4774 100644
  
  #define ZSTD_MAX_HUF_HEADER_SIZE 128 /* header + <= 127 byte tree description */
  /* Each table cannot take more than #symbols * FSELog bits */
+@@ -166,7 +169,7 @@ static void ZSTD_copy8(void* dst, const void* src) {
+     ZSTD_memcpy(dst, src, 8);
+ #endif
+ }
+-#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
++#define COPY8(d,s) do { ZSTD_copy8(d,s); d+=8; s+=8; } while (0)
+ 
+ /* Need to use memmove here since the literal buffer can now be located within
+    the dst buffer. In circumstances where the op "catches up" to where the
+@@ -186,7 +189,7 @@ static void ZSTD_copy16(void* dst, const void* src) {
+     ZSTD_memcpy(dst, copy16_buf, 16);
+ #endif
+ }
+-#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
++#define COPY16(d,s) do { ZSTD_copy16(d,s); d+=16; s+=16; } while (0)
+ 
+ #define WILDCOPY_OVERLENGTH 32
+ #define WILDCOPY_VECLEN 16
+@@ -215,7 +218,7 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
+     if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) {
+         /* Handle short offset copies. */
+         do {
+-            COPY8(op, ip)
++            COPY8(op, ip);
+         } while (op < oend);
+     } else {
+         assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN);
 @@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
           * one COPY16() in the first call. Then, do two calls per loop since
           * at that point it is more likely to have a high trip count.
@@ -12145,6 +17952,22 @@ index 93305d9b41bb..7f023e4d4774 100644
  
  
  /* ZSTD_invalidateRepCodes() :
+@@ -420,13 +357,13 @@ typedef struct {
+ 
+ /*! ZSTD_getcBlockSize() :
+  *  Provides the size of compressed block from block header `src` */
+-/* Used by: decompress, fullbench (does not get its definition from here) */
++/*  Used by: decompress, fullbench */
+ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+                           blockProperties_t* bpPtr);
+ 
+ /*! ZSTD_decodeSeqHeaders() :
+  *  decode sequence header from src */
+-/* Used by: decompress, fullbench (does not get its definition from here) */
++/*  Used by: zstd_decompress_block, fullbench */
+ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+                        const void* src, size_t srcSize);
+ 
 diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h
 index d9a76112ec3a..6ab8be6532ef 100644
 --- a/lib/zstd/compress/clevels.h
@@ -12158,7 +17981,7 @@ index d9a76112ec3a..6ab8be6532ef 100644
   *
   * This source code is licensed under both the BSD-style license (found in the
 diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c
-index ec5b1ca6d71a..e46ca6621b48 100644
+index ec5b1ca6d71a..44a3c10becf2 100644
 --- a/lib/zstd/compress/fse_compress.c
 +++ b/lib/zstd/compress/fse_compress.c
 @@ -1,6 +1,7 @@
@@ -12170,10 +17993,12 @@ index ec5b1ca6d71a..e46ca6621b48 100644
   *
   *  You can contact the author at :
   *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
-@@ -26,6 +27,7 @@
+@@ -25,7 +26,8 @@
+ #include "../common/error_private.h"
  #define ZSTD_DEPS_NEED_MALLOC
  #define ZSTD_DEPS_NEED_MATH64
- #include "../common/zstd_deps.h"  /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */
+-#include "../common/zstd_deps.h"  /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */
++#include "../common/zstd_deps.h"  /* ZSTD_memset */
 +#include "../common/bits.h" /* ZSTD_highbit32 */
  
  
@@ -12196,7 +18021,55 @@ index ec5b1ca6d71a..e46ca6621b48 100644
                      U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut;
                      symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
                      symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]);
-@@ -342,21 +344,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+@@ -224,8 +226,8 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+     size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog
+                                    + 4 /* bitCount initialized at 4 */
+                                    + 2 /* first two symbols may use one additional bit each */) / 8)
+-                                    + 1 /* round up to whole nb bytes */
+-                                    + 2 /* additional two bytes for bitstream flush */;
++                                   + 1 /* round up to whole nb bytes */
++                                   + 2 /* additional two bytes for bitstream flush */;
+     return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ? use default */
+ }
+ 
+@@ -254,7 +256,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+     /* Init */
+     remaining = tableSize+1;   /* +1 for extra accuracy */
+     threshold = tableSize;
+-    nbBits = tableLog+1;
++    nbBits = (int)tableLog+1;
+ 
+     while ((symbol < alphabetSize) && (remaining>1)) {  /* stops at 1 */
+         if (previousIs0) {
+@@ -273,7 +275,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+             }
+             while (symbol >= start+3) {
+                 start+=3;
+-                bitStream += 3 << bitCount;
++                bitStream += 3U << bitCount;
+                 bitCount += 2;
+             }
+             bitStream += (symbol-start) << bitCount;
+@@ -293,7 +295,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+             count++;   /* +1 for extra accuracy */
+             if (count>=threshold)
+                 count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+-            bitStream += count << bitCount;
++            bitStream += (U32)count << bitCount;
+             bitCount  += nbBits;
+             bitCount  -= (count<max);
+             previousIs0  = (count==1);
+@@ -321,7 +323,8 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+     out[1] = (BYTE)(bitStream>>8);
+     out+= (bitCount+7) /8;
+ 
+-    return (out-ostart);
++    assert(out >= ostart);
++    return (size_t)(out-ostart);
+ }
+ 
+ 
+@@ -342,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize,
  *  FSE Compression Code
  ****************************************************************/
  
@@ -12220,7 +18093,7 @@ index ec5b1ca6d71a..e46ca6621b48 100644
      U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
      assert(srcSize > 1); /* Not supported, RLE should be used instead */
      return minBits;
-@@ -364,7 +356,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+@@ -364,7 +357,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
  
  unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
  {
@@ -12229,7 +18102,7 @@ index ec5b1ca6d71a..e46ca6621b48 100644
      U32 tableLog = maxTableLog;
      U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
      assert(srcSize > 1); /* Not supported, RLE should be used instead */
-@@ -532,40 +524,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
+@@ -532,40 +525,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
      return tableLog;
  }
  
@@ -12270,7 +18143,7 @@ index ec5b1ca6d71a..e46ca6621b48 100644
  /* fake FSE_CTable, for rle input (always same symbol) */
  size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
  {
-@@ -664,5 +622,4 @@ size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
+@@ -664,5 +623,4 @@ size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
  
  size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
  
@@ -12305,7 +18178,7 @@ index fc1830abc9c6..f7687b0fc20a 100644
   *  You can contact the author at :
   *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
 diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compress.c
-index 74ef0db47621..83241abafe35 100644
+index 74ef0db47621..0b229f5d2ae2 100644
 --- a/lib/zstd/compress/huf_compress.c
 +++ b/lib/zstd/compress/huf_compress.c
 @@ -1,6 +1,7 @@
@@ -12351,20 +18224,20 @@ index 74ef0db47621..83241abafe35 100644
 +#if DEBUGLEVEL >= 2
 +
 +static size_t showU32(const U32* arr, size_t size)
-+{
+ {
+-    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
 +    size_t u;
 +    for (u=0; u<size; u++) {
 +        RAWLOG(6, " %u", arr[u]); (void)arr;
 +    }
 +    RAWLOG(6, " \n");
 +    return size;
-+}
-+
+ }
+ 
 +static size_t HUF_getNbBits(HUF_CElt elt);
 +
 +static size_t showCTableBits(const HUF_CElt* ctable, size_t size)
- {
--    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
++{
 +    size_t u;
 +    for (u=0; u<size; u++) {
 +        RAWLOG(6, " %zu", HUF_getNbBits(ctable[u])); (void)ctable;
@@ -12372,8 +18245,8 @@ index 74ef0db47621..83241abafe35 100644
 +    RAWLOG(6, " \n");
 +    return size;
 +
- }
- 
++}
++
 +static size_t showHNodeSymbols(const nodeElt* hnode, size_t size)
 +{
 +    size_t u;
@@ -12420,16 +18293,45 @@ index 74ef0db47621..83241abafe35 100644
  }
  
  static size_t HUF_getValueFast(HUF_CElt elt)
-@@ -175,6 +233,8 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
+@@ -160,6 +218,25 @@ static void HUF_setValue(HUF_CElt* elt, size_t value)
+     }
+ }
+ 
++HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable)
++{
++    HUF_CTableHeader header;
++    ZSTD_memcpy(&header, ctable, sizeof(header));
++    return header;
++}
++
++static void HUF_writeCTableHeader(HUF_CElt* ctable, U32 tableLog, U32 maxSymbolValue)
++{
++    HUF_CTableHeader header;
++    HUF_STATIC_ASSERT(sizeof(ctable[0]) == sizeof(header));
++    ZSTD_memset(&header, 0, sizeof(header));
++    assert(tableLog < 256);
++    header.tableLog = (BYTE)tableLog;
++    assert(maxSymbolValue < 256);
++    header.maxSymbolValue = (BYTE)maxSymbolValue;
++    ZSTD_memcpy(ctable, &header, sizeof(header));
++}
++
+ typedef struct {
+     HUF_CompressWeightsWksp wksp;
+     BYTE bitsToWeight[HUF_TABLELOG_MAX + 1];   /* precomputed conversion table */
+@@ -175,6 +252,11 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
      U32 n;
      HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
  
 +    HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp));
++
++    assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue);
++    assert(HUF_readCTableHeader(CTable).tableLog == huffLog);
 +
      /* check conditions */
      if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
      if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
-@@ -204,16 +264,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
+@@ -204,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
      return ((maxSymbolValue+1)/2) + 1;
  }
  
@@ -12446,13 +18348,32 @@ index 74ef0db47621..83241abafe35 100644
  
  size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights)
  {
-@@ -269,68 +319,64 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
+@@ -231,7 +303,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
+     if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+     if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
+ 
+-    CTable[0] = tableLog;
++    *maxSymbolValuePtr = nbSymbols - 1;
++
++    HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr);
+ 
+     /* Prepare base value per rank */
+     {   U32 n, nextRankStart = 0;
+@@ -263,74 +337,71 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
+         { U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); }
+     }
+ 
+-    *maxSymbolValuePtr = nbSymbols - 1;
+     return readSize;
+ }
  
  U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
  {
 -    const HUF_CElt* ct = CTable + 1;
 +    const HUF_CElt* const ct = CTable + 1;
      assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
++    if (symbolValue > HUF_readCTableHeader(CTable).maxSymbolValue)
++        return 0;
      return (U32)HUF_getNbBits(ct[symbolValue]);
  }
  
@@ -12542,7 +18463,7 @@ index 74ef0db47621..83241abafe35 100644
          assert(totalCost > 0);
  
          /* repay normalized cost */
-@@ -339,19 +385,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+@@ -339,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
  
              /* Get pos of last (smallest = lowest cum. count) symbol per rank */
              ZSTD_memset(rankLast, 0xF0, sizeof(rankLast));
@@ -12566,7 +18487,7 @@ index 74ef0db47621..83241abafe35 100644
                  for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
                      U32 const highPos = rankLast[nBitsToDecrease];
                      U32 const lowPos = rankLast[nBitsToDecrease-1];
-@@ -391,7 +437,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+@@ -391,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
                      rankLast[nBitsToDecrease] = noSymbol;
                  else {
                      rankLast[nBitsToDecrease]--;
@@ -12575,7 +18496,7 @@ index 74ef0db47621..83241abafe35 100644
                          rankLast[nBitsToDecrease] = noSymbol;   /* this rank is now empty */
                  }
              }   /* while (totalCost > 0) */
-@@ -403,11 +449,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+@@ -403,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
               * TODO.
               */
              while (totalCost < 0) {  /* Sometimes, cost correction overshoot */
@@ -12590,7 +18511,7 @@ index 74ef0db47621..83241abafe35 100644
                      huffNode[n+1].nbBits--;
                      assert(n >= 0);
                      rankLast[1] = (U32)(n+1);
-@@ -421,7 +467,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+@@ -421,7 +492,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
          }   /* repay normalized cost */
      }   /* there are several too large elements (at least >= 2) */
  
@@ -12599,7 +18520,7 @@ index 74ef0db47621..83241abafe35 100644
  }
  
  typedef struct {
-@@ -429,7 +475,7 @@ typedef struct {
+@@ -429,7 +500,7 @@ typedef struct {
      U16 curr;
  } rankPos;
  
@@ -12608,7 +18529,7 @@ index 74ef0db47621..83241abafe35 100644
  
  /* Number of buckets available for HUF_sort() */
  #define RANK_POSITION_TABLE_SIZE 192
-@@ -448,8 +494,8 @@ typedef struct {
+@@ -448,8 +519,8 @@ typedef struct {
   * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
   */
  #define RANK_POSITION_MAX_COUNT_LOG 32
@@ -12619,7 +18540,7 @@ index 74ef0db47621..83241abafe35 100644
  
  /* Return the appropriate bucket index for a given count. See definition of
   * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
-@@ -457,7 +503,7 @@ typedef struct {
+@@ -457,7 +528,7 @@ typedef struct {
  static U32 HUF_getIndex(U32 const count) {
      return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
          ? count
@@ -12628,7 +18549,7 @@ index 74ef0db47621..83241abafe35 100644
  }
  
  /* Helper swap function for HUF_quickSortPartition() */
-@@ -580,7 +626,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy
+@@ -580,7 +651,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy
  
      /* Sort each bucket. */
      for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {
@@ -12637,7 +18558,7 @@ index 74ef0db47621..83241abafe35 100644
          U32 const bucketStartIdx = rankPosition[n].base;
          if (bucketSize > 1) {
              assert(bucketStartIdx < maxSymbolValue1);
-@@ -591,6 +637,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy
+@@ -591,6 +662,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy
      assert(HUF_isSorted(huffNode, maxSymbolValue1));
  }
  
@@ -12645,7 +18566,7 @@ index 74ef0db47621..83241abafe35 100644
  /* HUF_buildCTable_wksp() :
   *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
   *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables).
-@@ -611,6 +658,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
+@@ -611,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
      int lowS, lowN;
      int nodeNb = STARTNODE;
      int n, nodeRoot;
@@ -12653,7 +18574,7 @@ index 74ef0db47621..83241abafe35 100644
      /* init for parents */
      nonNullRank = (int)maxSymbolValue;
      while(huffNode[nonNullRank].count == 0) nonNullRank--;
-@@ -637,6 +685,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
+@@ -637,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
      for (n=0; n<=nonNullRank; n++)
          huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
  
@@ -12662,8 +18583,13 @@ index 74ef0db47621..83241abafe35 100644
      return nonNullRank;
  }
  
-@@ -674,28 +724,36 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
-     CTable[0] = maxNbBits;
+@@ -671,31 +746,40 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
+         HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits);   /* push nbBits per symbol, symbol order */
+     for (n=0; n<alphabetSize; n++)
+         HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++);   /* assign value within rank, symbol order */
+-    CTable[0] = maxNbBits;
++
++    HUF_writeCTableHeader(CTable, maxNbBits, maxSymbolValue);
  }
  
 -size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
@@ -12704,7 +18630,35 @@ index 74ef0db47621..83241abafe35 100644
      maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
      if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC);   /* check fit into table */
  
-@@ -804,7 +862,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id
+@@ -716,13 +800,20 @@ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count,
+ }
+ 
+ int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+-  HUF_CElt const* ct = CTable + 1;
+-  int bad = 0;
+-  int s;
+-  for (s = 0; s <= (int)maxSymbolValue; ++s) {
+-    bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0);
+-  }
+-  return !bad;
++    HUF_CTableHeader header = HUF_readCTableHeader(CTable);
++    HUF_CElt const* ct = CTable + 1;
++    int bad = 0;
++    int s;
++
++    assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX);
++
++    if (header.maxSymbolValue < maxSymbolValue)
++        return 0;
++
++    for (s = 0; s <= (int)maxSymbolValue; ++s) {
++        bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0);
++    }
++    return !bad;
+ }
+ 
+ size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
+@@ -804,7 +895,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id
  #if DEBUGLEVEL >= 1
      {
          size_t const nbBits = HUF_getNbBits(elt);
@@ -12713,7 +18667,7 @@ index 74ef0db47621..83241abafe35 100644
          (void)dirtyBits;
          /* Middle bits are 0. */
          assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
-@@ -884,7 +942,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC)
+@@ -884,7 +975,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC)
      {
          size_t const nbBits = bitC->bitPos[0] & 0xFF;
          if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
@@ -12722,7 +18676,28 @@ index 74ef0db47621..83241abafe35 100644
      }
  }
  
-@@ -1045,9 +1103,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
+@@ -964,17 +1055,17 @@ HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
+                                    const void* src, size_t srcSize,
+                                    const HUF_CElt* CTable)
+ {
+-    U32 const tableLog = (U32)CTable[0];
++    U32 const tableLog = HUF_readCTableHeader(CTable).tableLog;
+     HUF_CElt const* ct = CTable + 1;
+     const BYTE* ip = (const BYTE*) src;
+     BYTE* const ostart = (BYTE*)dst;
+     BYTE* const oend = ostart + dstSize;
+-    BYTE* op = ostart;
+     HUF_CStream_t bitC;
+ 
+     /* init */
+     if (dstSize < 8) return 0;   /* not enough space to compress */
+-    { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op));
++    { BYTE* op = ostart;
++      size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op));
+       if (HUF_isError(initErr)) return 0; }
+ 
+     if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11)
+@@ -1045,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
  static size_t
  HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
                                const void* src, size_t srcSize,
@@ -12734,7 +18709,7 @@ index 74ef0db47621..83241abafe35 100644
          return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
      }
      return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
-@@ -1058,28 +1116,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+@@ -1058,28 +1149,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
  static size_t
  HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
                                const void* src, size_t srcSize,
@@ -12749,13 +18724,13 @@ index 74ef0db47621..83241abafe35 100644
  #endif
  
 -size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
--{
++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
+ {
 -    return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
 -}
 -
 -size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
-+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
- {
+-{
 -    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
 +    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
  }
@@ -12768,7 +18743,7 @@ index 74ef0db47621..83241abafe35 100644
  {
      size_t const segmentSize = (srcSize+3)/4;   /* first 3 segments */
      const BYTE* ip = (const BYTE*) src;
-@@ -1093,7 +1146,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+@@ -1093,7 +1179,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
      op += 6;   /* jumpTable */
  
      assert(op <= oend);
@@ -12777,7 +18752,7 @@ index 74ef0db47621..83241abafe35 100644
          if (cSize == 0 || cSize > 65535) return 0;
          MEM_writeLE16(ostart, (U16)cSize);
          op += cSize;
-@@ -1101,7 +1154,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+@@ -1101,7 +1187,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
  
      ip += segmentSize;
      assert(op <= oend);
@@ -12786,7 +18761,7 @@ index 74ef0db47621..83241abafe35 100644
          if (cSize == 0 || cSize > 65535) return 0;
          MEM_writeLE16(ostart+2, (U16)cSize);
          op += cSize;
-@@ -1109,7 +1162,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+@@ -1109,7 +1195,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
  
      ip += segmentSize;
      assert(op <= oend);
@@ -12795,7 +18770,7 @@ index 74ef0db47621..83241abafe35 100644
          if (cSize == 0 || cSize > 65535) return 0;
          MEM_writeLE16(ostart+4, (U16)cSize);
          op += cSize;
-@@ -1118,7 +1171,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+@@ -1118,7 +1204,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
      ip += segmentSize;
      assert(op <= oend);
      assert(ip <= iend);
@@ -12804,24 +18779,24 @@ index 74ef0db47621..83241abafe35 100644
          if (cSize == 0 || cSize > 65535) return 0;
          op += cSize;
      }
-@@ -1126,14 +1179,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+@@ -1126,14 +1212,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
      return (size_t)(op-ostart);
  }
  
 -size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
-+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
- {
+-{
 -    return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
 -}
 -
 -size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
--{
++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
+ {
 -    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
 +    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
  }
  
  typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
-@@ -1141,11 +1189,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
+@@ -1141,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
  static size_t HUF_compressCTable_internal(
                  BYTE* const ostart, BYTE* op, BYTE* const oend,
                  const void* src, size_t srcSize,
@@ -12836,7 +18811,7 @@ index 74ef0db47621..83241abafe35 100644
      if (HUF_isError(cSize)) { return cSize; }
      if (cSize==0) { return 0; }   /* uncompressible */
      op += cSize;
-@@ -1168,6 +1216,79 @@ typedef struct {
+@@ -1168,6 +1249,81 @@ typedef struct {
  #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
  #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10  /* Must be >= 2 */
  
@@ -12877,7 +18852,7 @@ index 74ef0db47621..83241abafe35 100644
 +
 +    {   BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp);
 +        size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp);
-+        size_t maxBits, hSize, newSize;
++        size_t hSize, newSize;
 +        const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue);
 +        const unsigned minTableLog = HUF_minTableLog(symbolCardinality);
 +        size_t optSize = ((size_t) ~0) - 1;
@@ -12888,12 +18863,14 @@ index 74ef0db47621..83241abafe35 100644
 +        /* Search until size increases */
 +        for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) {
 +            DEBUGLOG(7, "checking for huffLog=%u", optLogGuess);
-+            maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize);
-+            if (ERR_isError(maxBits)) continue;
 +
-+            if (maxBits < optLogGuess && optLogGuess > minTableLog) break;
++            {   size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize);
++                if (ERR_isError(maxBits)) continue;
 +
-+            hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize);
++                if (maxBits < optLogGuess && optLogGuess > minTableLog) break;
++
++                hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize);
++            }
 +
 +            if (ERR_isError(hSize)) continue;
 +
@@ -12916,7 +18893,7 @@ index 74ef0db47621..83241abafe35 100644
  /* HUF_compress_internal() :
   * `workSpace_align4` must be aligned on 4-bytes boundaries,
   * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */
-@@ -1177,14 +1298,14 @@ HUF_compress_internal (void* dst, size_t dstSize,
+@@ -1177,14 +1333,14 @@ HUF_compress_internal (void* dst, size_t dstSize,
                         unsigned maxSymbolValue, unsigned huffLog,
                         HUF_nbStreams_e nbStreams,
                         void* workSpace, size_t wkspSize,
@@ -12933,7 +18910,7 @@ index 74ef0db47621..83241abafe35 100644
      HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);
  
      /* checks & inits */
-@@ -1198,16 +1319,17 @@ HUF_compress_internal (void* dst, size_t dstSize,
+@@ -1198,16 +1354,17 @@ HUF_compress_internal (void* dst, size_t dstSize,
      if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;
  
      /* Heuristic : If old table is valid, use it for small inputs */
@@ -12954,7 +18931,7 @@ index 74ef0db47621..83241abafe35 100644
          {   unsigned maxSymbolValueBegin = maxSymbolValue;
              CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
              largestTotal += largestBegin;
-@@ -1224,6 +1346,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
+@@ -1224,6 +1381,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
          if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; }   /* single symbol, rle */
          if (largest <= (srcSize >> 7)+4) return 0;   /* heuristic : probably not compressible enough */
      }
@@ -12962,7 +18939,7 @@ index 74ef0db47621..83241abafe35 100644
  
      /* Check validity of previous table */
      if ( repeat
-@@ -1232,19 +1355,20 @@ HUF_compress_internal (void* dst, size_t dstSize,
+@@ -1232,25 +1390,20 @@ HUF_compress_internal (void* dst, size_t dstSize,
          *repeat = HUF_repeat_none;
      }
      /* Heuristic : use existing table for small inputs */
@@ -12982,11 +18959,17 @@ index 74ef0db47621..83241abafe35 100644
                                              &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
          CHECK_F(maxBits);
          huffLog = (U32)maxBits;
+-    }
+-    /* Zero unused symbols in CTable, so we can check it for validity */
+-    {
+-        size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue);
+-        size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt);
+-        ZSTD_memset(table->CTable + ctableSize, 0, unusedSize);
 +        DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1));
      }
-     /* Zero unused symbols in CTable, so we can check it for validity */
-     {
-@@ -1263,7 +1387,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
+ 
+     /* Write table description header */
+@@ -1263,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
              if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
                  return HUF_compressCTable_internal(ostart, op, oend,
                                                     src, srcSize,
@@ -12995,7 +18978,7 @@ index 74ef0db47621..83241abafe35 100644
          }   }
  
          /* Use the new huffman table */
-@@ -1275,46 +1399,20 @@ HUF_compress_internal (void* dst, size_t dstSize,
+@@ -1275,61 +1428,35 @@ HUF_compress_internal (void* dst, size_t dstSize,
      }
      return HUF_compressCTable_internal(ostart, op, oend,
                                         src, srcSize,
@@ -13046,7 +19029,11 @@ index 74ef0db47621..83241abafe35 100644
  }
  
  /* HUF_compress4X_repeat():
-@@ -1325,11 +1423,11 @@ size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
+  * compress input using 4 streams.
+  * consider skipping quickly
+- * re-use an existing huffman compression table */
++ * reuse an existing huffman compression table */
+ size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
                        const void* src, size_t srcSize,
                        unsigned maxSymbolValue, unsigned huffLog,
                        void* workSpace, size_t wkspSize,
@@ -13062,7 +19049,7 @@ index 74ef0db47621..83241abafe35 100644
  }
 -
 diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c
-index f620cafca633..c1c316e9e289 100644
+index f620cafca633..0d139727cd39 100644
 --- a/lib/zstd/compress/zstd_compress.c
 +++ b/lib/zstd/compress/zstd_compress.c
 @@ -1,5 +1,6 @@
@@ -13119,7 +19106,11 @@ index f620cafca633..c1c316e9e289 100644
  }
  
  
-@@ -171,12 +176,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
+@@ -168,15 +173,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx)
+ 
+ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
+ {
++    DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx);
      if (cctx==NULL) return 0;   /* support free on NULL */
      RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
                      "not compatible with static CCtx");
@@ -13134,7 +19125,7 @@ index f620cafca633..c1c316e9e289 100644
      }
      return 0;
  }
-@@ -257,9 +259,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
+@@ -257,9 +260,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
      return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder));
  }
  
@@ -13146,7 +19137,7 @@ index f620cafca633..c1c316e9e289 100644
   */
  static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
                                   const ZSTD_compressionParameters* const cParams) {
-@@ -267,6 +269,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
+@@ -267,6 +270,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
      return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable;
  }
  
@@ -13181,7 +19172,7 @@ index f620cafca633..c1c316e9e289 100644
  static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
          ZSTD_compressionParameters cParams)
  {
-@@ -284,6 +314,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
+@@ -284,6 +315,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
      }
      cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
      cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
@@ -13192,7 +19183,7 @@ index f620cafca633..c1c316e9e289 100644
      assert(!ZSTD_checkCParams(cParams));
      return cctxParams;
  }
-@@ -329,10 +363,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel)
+@@ -329,10 +364,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel)
  #define ZSTD_NO_CLEVEL 0
  
  /*
@@ -13208,7 +19199,7 @@ index f620cafca633..c1c316e9e289 100644
  {
      assert(!ZSTD_checkCParams(params->cParams));
      ZSTD_memset(cctxParams, 0, sizeof(*cctxParams));
-@@ -345,6 +382,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par
+@@ -345,6 +383,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par
      cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
      cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
      cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
@@ -13218,7 +19209,7 @@ index f620cafca633..c1c316e9e289 100644
      DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
                  cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
  }
-@@ -359,7 +399,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete
+@@ -359,7 +400,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete
  
  /*
   * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone.
@@ -13227,7 +19218,7 @@ index f620cafca633..c1c316e9e289 100644
   */
  static void ZSTD_CCtxParams_setZstdParams(
          ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params)
-@@ -455,8 +495,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
+@@ -455,8 +496,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
          return bounds;
  
      case ZSTD_c_enableLongDistanceMatching:
@@ -13238,7 +19229,7 @@ index f620cafca633..c1c316e9e289 100644
          return bounds;
  
      case ZSTD_c_ldmHashLog:
-@@ -549,6 +589,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
+@@ -549,6 +590,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
          bounds.upperBound = 1;
          return bounds;
  
@@ -13265,7 +19256,23 @@ index f620cafca633..c1c316e9e289 100644
      default:
          bounds.error = ERROR(parameter_unsupported);
          return bounds;
-@@ -613,6 +673,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
+@@ -567,10 +628,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value)
+     return 0;
+ }
+ 
+-#define BOUNDCHECK(cParam, val) { \
+-    RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \
+-                    parameter_outOfBound, "Param out of bounds"); \
+-}
++#define BOUNDCHECK(cParam, val)                                       \
++    do {                                                              \
++        RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val),        \
++                        parameter_outOfBound, "Param out of bounds"); \
++    } while (0)
+ 
+ 
+ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
+@@ -613,6 +675,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
      case ZSTD_c_useBlockSplitter:
      case ZSTD_c_useRowMatchFinder:
      case ZSTD_c_deterministicRefPrefix:
@@ -13276,7 +19283,7 @@ index f620cafca633..c1c316e9e289 100644
      default:
          return 0;
      }
-@@ -625,7 +689,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
+@@ -625,7 +691,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
          if (ZSTD_isUpdateAuthorized(param)) {
              cctx->cParamsChanged = 1;
          } else {
@@ -13285,7 +19292,7 @@ index f620cafca633..c1c316e9e289 100644
      }   }
  
      switch(param)
-@@ -668,6 +732,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
+@@ -668,6 +734,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
      case ZSTD_c_useBlockSplitter:
      case ZSTD_c_useRowMatchFinder:
      case ZSTD_c_deterministicRefPrefix:
@@ -13296,7 +19303,7 @@ index f620cafca633..c1c316e9e289 100644
          break;
  
      default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
-@@ -723,12 +791,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+@@ -723,12 +793,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
      case ZSTD_c_minMatch :
          if (value!=0)   /* 0 => use default */
              BOUNDCHECK(ZSTD_c_minMatch, value);
@@ -13311,7 +19318,7 @@ index f620cafca633..c1c316e9e289 100644
          return CCtxParams->cParams.targetLength;
  
      case ZSTD_c_strategy :
-@@ -741,12 +809,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+@@ -741,12 +811,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
          /* Content size written in frame header _when known_ (default:1) */
          DEBUGLOG(4, "set content size flag = %u", (value!=0));
          CCtxParams->fParams.contentSizeFlag = value != 0;
@@ -13326,7 +19333,7 @@ index f620cafca633..c1c316e9e289 100644
  
      case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */
          DEBUGLOG(4, "set dictIDFlag = %u", (value!=0));
-@@ -755,18 +823,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+@@ -755,18 +825,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
  
      case ZSTD_c_forceMaxWindow :
          CCtxParams->forceWindow = (value != 0);
@@ -13348,7 +19355,7 @@ index f620cafca633..c1c316e9e289 100644
          CCtxParams->literalCompressionMode = lcm;
          return CCtxParams->literalCompressionMode;
      }
-@@ -789,47 +857,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+@@ -789,47 +859,50 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
  
      case ZSTD_c_enableDedicatedDictSearch :
          CCtxParams->enableDedicatedDictSearch = (value!=0);
@@ -13389,9 +19396,12 @@ index f620cafca633..c1c316e9e289 100644
          return CCtxParams->ldmParams.hashRateLog;
  
      case ZSTD_c_targetCBlockSize :
-         if (value!=0)   /* 0 ==> default */
+-        if (value!=0)   /* 0 ==> default */
++        if (value!=0) {  /* 0 ==> default */
++            value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN);
              BOUNDCHECK(ZSTD_c_targetCBlockSize, value);
 -        CCtxParams->targetCBlockSize = value;
++        }
 +        CCtxParams->targetCBlockSize = (U32)value;
          return CCtxParams->targetCBlockSize;
  
@@ -13404,10 +19414,22 @@ index f620cafca633..c1c316e9e289 100644
  
      case ZSTD_c_stableInBuffer:
          BOUNDCHECK(ZSTD_c_stableInBuffer, value);
-@@ -866,6 +935,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
-         CCtxParams->deterministicRefPrefix = !!value;
-         return CCtxParams->deterministicRefPrefix;
+@@ -849,7 +922,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+     case ZSTD_c_validateSequences:
+         BOUNDCHECK(ZSTD_c_validateSequences, value);
+         CCtxParams->validateSequences = value;
+-        return CCtxParams->validateSequences;
++        return (size_t)CCtxParams->validateSequences;
  
+     case ZSTD_c_useBlockSplitter:
+         BOUNDCHECK(ZSTD_c_useBlockSplitter, value);
+@@ -864,7 +937,28 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+     case ZSTD_c_deterministicRefPrefix:
+         BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value);
+         CCtxParams->deterministicRefPrefix = !!value;
+-        return CCtxParams->deterministicRefPrefix;
++        return (size_t)CCtxParams->deterministicRefPrefix;
++
 +    case ZSTD_c_prefetchCDictTables:
 +        BOUNDCHECK(ZSTD_c_prefetchCDictTables, value);
 +        CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value;
@@ -13416,7 +19438,7 @@ index f620cafca633..c1c316e9e289 100644
 +    case ZSTD_c_enableSeqProducerFallback:
 +        BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value);
 +        CCtxParams->enableMatchFinderFallback = value;
-+        return CCtxParams->enableMatchFinderFallback;
++        return (size_t)CCtxParams->enableMatchFinderFallback;
 +
 +    case ZSTD_c_maxBlockSize:
 +        if (value!=0)    /* 0 ==> default */
@@ -13428,11 +19450,10 @@ index f620cafca633..c1c316e9e289 100644
 +        BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value);
 +        CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value;
 +        return CCtxParams->searchForExternalRepcodes;
-+
+ 
      default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
      }
- }
-@@ -980,6 +1070,18 @@ size_t ZSTD_CCtxParams_getParameter(
+@@ -980,6 +1074,18 @@ size_t ZSTD_CCtxParams_getParameter(
      case ZSTD_c_deterministicRefPrefix:
          *value = (int)CCtxParams->deterministicRefPrefix;
          break;
@@ -13451,7 +19472,7 @@ index f620cafca633..c1c316e9e289 100644
      default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
      }
      return 0;
-@@ -1006,9 +1108,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+@@ -1006,9 +1112,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams(
      return 0;
  }
  
@@ -13500,7 +19521,7 @@ index f620cafca633..c1c316e9e289 100644
      RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
                      "Can't set pledgedSrcSize when not in init stage.");
      cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
-@@ -1024,9 +1164,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams(
+@@ -1024,9 +1168,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams(
          ZSTD_compressionParameters* cParams);
  
  /*
@@ -13513,7 +19534,7 @@ index f620cafca633..c1c316e9e289 100644
   */
  static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
  {
-@@ -1039,8 +1179,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
+@@ -1039,8 +1183,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
          return 0;
      }
      if (dl->cdict != NULL) {
@@ -13523,7 +19544,7 @@ index f620cafca633..c1c316e9e289 100644
          return 0;
      }
      assert(dl->dictSize > 0);
-@@ -1060,26 +1200,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
+@@ -1060,26 +1204,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
  }
  
  size_t ZSTD_CCtx_loadDictionary_advanced(
@@ -13564,18 +19585,34 @@ index f620cafca633..c1c316e9e289 100644
      }
      cctx->localDict.dictSize = dictSize;
      cctx->localDict.dictContentType = dictContentType;
-@@ -1149,8 +1293,9 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
+@@ -1149,7 +1297,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
      if ( (reset == ZSTD_reset_parameters)
        || (reset == ZSTD_reset_session_and_parameters) ) {
          RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
 -                        "Can't reset parameters only when not in init stage.");
 +                        "Reset parameters is only possible during init stage.");
          ZSTD_clearAllDicts(cctx);
-+        ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx));
          return ZSTD_CCtxParams_reset(&cctx->requestedParams);
      }
-     return 0;
-@@ -1247,7 +1392,8 @@ static ZSTD_compressionParameters
+@@ -1178,11 +1326,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+ static ZSTD_compressionParameters
+ ZSTD_clampCParams(ZSTD_compressionParameters cParams)
+ {
+-#   define CLAMP_TYPE(cParam, val, type) {                                \
+-        ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);         \
+-        if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound;      \
+-        else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
+-    }
++#   define CLAMP_TYPE(cParam, val, type)                                      \
++        do {                                                                  \
++            ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);         \
++            if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound;      \
++            else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
++        } while (0)
+ #   define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned)
+     CLAMP(ZSTD_c_windowLog, cParams.windowLog);
+     CLAMP(ZSTD_c_chainLog,  cParams.chainLog);
+@@ -1247,12 +1396,55 @@ static ZSTD_compressionParameters
  ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
                              unsigned long long srcSize,
                              size_t dictSize,
@@ -13585,7 +19622,54 @@ index f620cafca633..c1c316e9e289 100644
  {
      const U64 minSrcSize = 513; /* (1<<9) + 1 */
      const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
-@@ -1281,8 +1427,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
+     assert(ZSTD_checkCParams(cPar)==0);
+ 
++    /* Cascade the selected strategy down to the next-highest one built into
++     * this binary. */
++#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
++    if (cPar.strategy == ZSTD_btultra2) {
++        cPar.strategy = ZSTD_btultra;
++    }
++    if (cPar.strategy == ZSTD_btultra) {
++        cPar.strategy = ZSTD_btopt;
++    }
++#endif
++#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
++    if (cPar.strategy == ZSTD_btopt) {
++        cPar.strategy = ZSTD_btlazy2;
++    }
++#endif
++#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
++    if (cPar.strategy == ZSTD_btlazy2) {
++        cPar.strategy = ZSTD_lazy2;
++    }
++#endif
++#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
++    if (cPar.strategy == ZSTD_lazy2) {
++        cPar.strategy = ZSTD_lazy;
++    }
++#endif
++#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
++    if (cPar.strategy == ZSTD_lazy) {
++        cPar.strategy = ZSTD_greedy;
++    }
++#endif
++#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
++    if (cPar.strategy == ZSTD_greedy) {
++        cPar.strategy = ZSTD_dfast;
++    }
++#endif
++#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
++    if (cPar.strategy == ZSTD_dfast) {
++        cPar.strategy = ZSTD_fast;
++        cPar.targetLength = 0;
++    }
++#endif
++
+     switch (mode) {
+     case ZSTD_cpm_unknown:
+     case ZSTD_cpm_noAttachDict:
+@@ -1281,8 +1473,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
      }
  
      /* resize windowLog if input is small enough, to use less memory */
@@ -13596,7 +19680,7 @@ index f620cafca633..c1c316e9e289 100644
          U32 const tSize = (U32)(srcSize + dictSize);
          static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN;
          U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN :
-@@ -1300,6 +1446,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
+@@ -1300,6 +1492,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
      if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
          cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN;  /* minimum wlog required for valid frame header */
  
@@ -13639,7 +19723,7 @@ index f620cafca633..c1c316e9e289 100644
      return cPar;
  }
  
-@@ -1310,7 +1492,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
+@@ -1310,7 +1538,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
  {
      cPar = ZSTD_clampCParams(cPar);   /* resulting cPar is necessarily valid (all parameters within range) */
      if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN;
@@ -13648,7 +19732,7 @@ index f620cafca633..c1c316e9e289 100644
  }
  
  static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode);
-@@ -1341,7 +1523,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+@@ -1341,7 +1569,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
      ZSTD_overrideCParams(&cParams, &CCtxParams->cParams);
      assert(!ZSTD_checkCParams(cParams));
      /* srcSizeHint == 0 means 0 */
@@ -13657,16 +19741,21 @@ index f620cafca633..c1c316e9e289 100644
  }
  
  static size_t
-@@ -1370,7 +1552,7 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
-       + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
-       + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+@@ -1367,10 +1595,10 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
+       + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32))
+       + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32))
+       + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32))
+-      + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
+-      + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
++      + ZSTD_cwksp_aligned_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_match_t))
++      + ZSTD_cwksp_aligned_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t));
      size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)
 -                                            ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16))
 +                                            ? ZSTD_cwksp_aligned_alloc_size(hSize)
                                              : 0;
      size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
                                  ? optPotentialSpace
-@@ -1386,6 +1568,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
+@@ -1386,6 +1614,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
      return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
  }
  
@@ -13680,7 +19769,7 @@ index f620cafca633..c1c316e9e289 100644
  static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
          const ZSTD_compressionParameters* cParams,
          const ldmParams_t* ldmParams,
-@@ -1393,12 +1582,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+@@ -1393,12 +1628,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
          const ZSTD_paramSwitch_e useRowMatchFinder,
          const size_t buffInSize,
          const size_t buffOutSize,
@@ -13698,7 +19787,7 @@ index f620cafca633..c1c316e9e289 100644
      size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
                              + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef))
                              + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
-@@ -1417,6 +1607,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+@@ -1417,6 +1653,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
  
      size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;
  
@@ -13710,7 +19799,7 @@ index f620cafca633..c1c316e9e289 100644
      size_t const neededSpace =
          cctxSpace +
          entropySpace +
-@@ -1425,7 +1620,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+@@ -1425,7 +1666,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
          ldmSeqSpace +
          matchStateSize +
          tokenSpace +
@@ -13720,16 +19809,16 @@ index f620cafca633..c1c316e9e289 100644
  
      DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);
      return neededSpace;
-@@ -1443,7 +1639,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+@@ -1443,7 +1685,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
       * be needed. However, we still allocate two 0-sized buffers, which can
       * take space under ASAN. */
      return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
 -        &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
-+        &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
++        &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
  }
  
  size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
-@@ -1493,7 +1689,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+@@ -1493,7 +1735,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
      RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
      {   ZSTD_compressionParameters const cParams =
                  ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
@@ -13738,16 +19827,16 @@ index f620cafca633..c1c316e9e289 100644
          size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered)
                  ? ((size_t)1 << cParams.windowLog) + blockSize
                  : 0;
-@@ -1504,7 +1700,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+@@ -1504,7 +1746,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
  
          return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
              &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
 -            ZSTD_CONTENTSIZE_UNKNOWN);
-+            ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize);
++            ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
      }
  }
  
-@@ -1637,6 +1833,19 @@ typedef enum {
+@@ -1637,6 +1879,19 @@ typedef enum {
      ZSTD_resetTarget_CCtx
  } ZSTD_resetTarget_e;
  
@@ -13767,7 +19856,7 @@ index f620cafca633..c1c316e9e289 100644
  
  static size_t
  ZSTD_reset_matchState(ZSTD_matchState_t* ms,
-@@ -1664,6 +1873,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+@@ -1664,6 +1919,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
      }
  
      ms->hashLog3 = hashLog3;
@@ -13775,11 +19864,26 @@ index f620cafca633..c1c316e9e289 100644
  
      ZSTD_invalidateMatchState(ms);
  
-@@ -1685,6 +1895,27 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+@@ -1685,22 +1941,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
          ZSTD_cwksp_clean_tables(ws);
      }
  
-+    if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+-    /* opt parser space */
+-    if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
+-        DEBUGLOG(4, "reserving optimal parser space");
+-        ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
+-        ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned));
+-        ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned));
+-        ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned));
+-        ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t));
+-        ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+-    }
+-
+     if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+-        {   /* Row match finder needs an additional table of hashes ("tags") */
+-            size_t const tagTableSize = hSize*sizeof(U16);
+-            ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+-            if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
 +        /* Row match finder needs an additional table of hashes ("tags") */
 +        size_t const tagTableSize = hSize;
 +        /* We want to generate a new salt in case we reset a Cctx, but we always want to use
@@ -13792,38 +19896,28 @@ index f620cafca633..c1c316e9e289 100644
 +            ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
 +            ZSTD_memset(ms->tagTable, 0, tagTableSize);
 +            ms->hashSalt = 0;
-+        }
-+        {   /* Switch to 32-entry rows if searchLog is 5 (or more) */
-+            U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
-+            assert(cParams->hashLog >= rowLog);
-+            ms->rowHashLog = cParams->hashLog - rowLog;
-+        }
-+    }
-+
-     /* opt parser space */
-     if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
-         DEBUGLOG(4, "reserving optimal parser space");
-@@ -1696,19 +1927,6 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
-         ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+         }
+         {   /* Switch to 32-entry rows if searchLog is 5 (or more) */
+             U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
+@@ -1709,6 +1962,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+         }
      }
  
--    if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
--        {   /* Row match finder needs an additional table of hashes ("tags") */
--            size_t const tagTableSize = hSize*sizeof(U16);
--            ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
--            if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
--        }
--        {   /* Switch to 32-entry rows if searchLog is 5 (or more) */
--            U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
--            assert(cParams->hashLog >= rowLog);
--            ms->rowHashLog = cParams->hashLog - rowLog;
--        }
--    }
--
++    /* opt parser space */
++    if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
++        DEBUGLOG(4, "reserving optimal parser space");
++        ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
++        ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned));
++        ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned));
++        ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned));
++        ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t));
++        ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t));
++    }
++
      ms->cParams = *cParams;
  
      RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
-@@ -1768,6 +1986,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+@@ -1768,6 +2032,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
      assert(params->useRowMatchFinder != ZSTD_ps_auto);
      assert(params->useBlockSplitter != ZSTD_ps_auto);
      assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
@@ -13831,7 +19925,7 @@ index f620cafca633..c1c316e9e289 100644
      if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
          /* Adjust long distance matching parameters */
          ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams);
-@@ -1776,9 +1995,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+@@ -1776,9 +2041,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
      }
  
      {   size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
@@ -13839,20 +19933,30 @@ index f620cafca633..c1c316e9e289 100644
 -        U32    const divider = (params->cParams.minMatch==3) ? 3 : 4;
 -        size_t const maxNbSeq = blockSize / divider;
 +        size_t const blockSize = MIN(params->maxBlockSize, windowSize);
-+        size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useSequenceProducer);
++        size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params));
          size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
                  ? ZSTD_compressBound(blockSize) + 1
                  : 0;
-@@ -1795,7 +2013,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+@@ -1795,8 +2059,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
          size_t const neededSpace =
              ZSTD_estimateCCtxSize_usingCCtxParams_internal(
                  &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
 -                buffInSize, buffOutSize, pledgedSrcSize);
-+                buffInSize, buffOutSize, pledgedSrcSize, params->useSequenceProducer, params->maxBlockSize);
-         int resizeWorkspace;
+-        int resizeWorkspace;
++                buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
  
          FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
-@@ -1838,6 +2056,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ 
+@@ -1805,7 +2068,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+         {   /* Check if workspace is large enough, alloc a new one if needed */
+             int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace;
+             int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace);
+-            resizeWorkspace = workspaceTooSmall || workspaceWasteful;
++            int resizeWorkspace = workspaceTooSmall || workspaceWasteful;
+             DEBUGLOG(4, "Need %zu B workspace", neededSpace);
+             DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize);
+ 
+@@ -1838,6 +2101,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
  
          /* init params */
          zc->blockState.matchState.cParams = params->cParams;
@@ -13860,7 +19964,7 @@ index f620cafca633..c1c316e9e289 100644
          zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
          zc->consumedSrcSize = 0;
          zc->producedCSize = 0;
-@@ -1854,13 +2073,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+@@ -1854,13 +2118,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
  
          ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
  
@@ -13889,10 +19993,10 @@ index f620cafca633..c1c316e9e289 100644
 +        }
 +
 +        /* reserve space for block-level external sequences */
-+        if (params->useSequenceProducer) {
++        if (ZSTD_hasExtSeqProd(params)) {
 +            size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
-+            zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq;
-+            zc->externalMatchCtx.seqBuffer =
++            zc->extSeqBufCapacity = maxNbExternalSeq;
++            zc->extSeqBuf =
 +                (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence));
 +        }
 +
@@ -13908,7 +20012,7 @@ index f620cafca633..c1c316e9e289 100644
          zc->bufferedPolicy = zbuff;
          zc->inBuffSize = buffInSize;
          zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize);
-@@ -1883,32 +2135,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+@@ -1883,32 +2180,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
          zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
          zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
          zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
@@ -13942,7 +20046,7 @@ index f620cafca633..c1c316e9e289 100644
  
          zc->initialized = 1;
  
-@@ -1980,7 +2209,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
+@@ -1980,7 +2254,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
          }
  
          params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize,
@@ -13952,7 +20056,7 @@ index f620cafca633..c1c316e9e289 100644
          params.cParams.windowLog = windowLog;
          params.useRowMatchFinder = cdict->useRowMatchFinder;    /* cdict overrides */
          FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
-@@ -2019,6 +2249,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
+@@ -2019,6 +2294,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
      return 0;
  }
  
@@ -13975,7 +20079,7 @@ index f620cafca633..c1c316e9e289 100644
  static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
                              const ZSTD_CDict* cdict,
                              ZSTD_CCtx_params params,
-@@ -2054,21 +2300,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
+@@ -2054,21 +2345,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
                                                              : 0;
          size_t const hSize =  (size_t)1 << cdict_cParams->hashLog;
  
@@ -14008,7 +20112,7 @@ index f620cafca633..c1c316e9e289 100644
          }
      }
  
-@@ -2147,6 +2395,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
+@@ -2147,6 +2440,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
          params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter;
          params.ldmParams = srcCCtx->appliedParams.ldmParams;
          params.fParams = fParams;
@@ -14016,7 +20120,7 @@ index f620cafca633..c1c316e9e289 100644
          ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize,
                                  /* loadedDictSize */ 0,
                                  ZSTDcrp_leaveDirty, zbuff);
-@@ -2294,7 +2543,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par
+@@ -2294,7 +2588,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par
  
  /* See doc/zstd_compression_format.md for detailed format description */
  
@@ -14025,7 +20129,7 @@ index f620cafca633..c1c316e9e289 100644
  {
      const seqDef* const sequences = seqStorePtr->sequencesStart;
      BYTE* const llCodeTable = seqStorePtr->llCode;
-@@ -2302,18 +2551,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+@@ -2302,18 +2596,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
      BYTE* const mlCodeTable = seqStorePtr->mlCode;
      U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
      U32 u;
@@ -14051,7 +20155,7 @@ index f620cafca633..c1c316e9e289 100644
  }
  
  /* ZSTD_useTargetCBlockSize():
-@@ -2347,6 +2602,7 @@ typedef struct {
+@@ -2347,6 +2647,7 @@ typedef struct {
      U32 MLtype;
      size_t size;
      size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */
@@ -14059,7 +20163,7 @@ index f620cafca633..c1c316e9e289 100644
  } ZSTD_symbolEncodingTypeStats_t;
  
  /* ZSTD_buildSequencesStatistics():
-@@ -2357,11 +2613,13 @@ typedef struct {
+@@ -2357,11 +2658,13 @@ typedef struct {
   * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32)
   */
  static ZSTD_symbolEncodingTypeStats_t
@@ -14078,7 +20182,7 @@ index f620cafca633..c1c316e9e289 100644
      BYTE* const ostart = dst;
      const BYTE* const oend = dstEnd;
      BYTE* op = ostart;
-@@ -2375,7 +2633,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
+@@ -2375,7 +2678,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
  
      stats.lastCountSize = 0;
      /* convert length/distances into codes */
@@ -14087,7 +20191,7 @@ index f620cafca633..c1c316e9e289 100644
      assert(op <= oend);
      assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */
      /* build CTable for Literal Lengths */
-@@ -2480,22 +2738,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
+@@ -2480,22 +2783,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
   */
  #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20
  MEM_STATIC size_t
@@ -14119,7 +20223,7 @@ index f620cafca633..c1c316e9e289 100644
      const BYTE* const ofCodeTable = seqStorePtr->ofCode;
      const BYTE* const llCodeTable = seqStorePtr->llCode;
      const BYTE* const mlCodeTable = seqStorePtr->mlCode;
-@@ -2503,29 +2761,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
+@@ -2503,29 +2806,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
      BYTE* const oend = ostart + dstCapacity;
      BYTE* op = ostart;
      size_t lastCountSize;
@@ -14158,7 +20262,7 @@ index f620cafca633..c1c316e9e289 100644
          FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
          assert(cSize <= dstCapacity);
          op += cSize;
-@@ -2551,11 +2811,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
+@@ -2551,11 +2856,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
          ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
          return (size_t)(op - ostart);
      }
@@ -14173,7 +20277,7 @@ index f620cafca633..c1c316e9e289 100644
                                               &prevEntropy->fse, &nextEntropy->fse,
                                                op, oend,
                                                strategy, count,
-@@ -2564,6 +2823,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
+@@ -2564,6 +2868,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
          *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2));
          lastCountSize = stats.lastCountSize;
          op += stats.size;
@@ -14181,7 +20285,7 @@ index f620cafca633..c1c316e9e289 100644
      }
  
      {   size_t const bitstreamSize = ZSTD_encodeSequences(
-@@ -2598,14 +2858,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
+@@ -2598,14 +2903,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
  }
  
  MEM_STATIC size_t
@@ -14205,7 +20309,7 @@ index f620cafca633..c1c316e9e289 100644
  {
      size_t const cSize = ZSTD_entropyCompressSeqStore_internal(
                              seqStorePtr, prevEntropy, nextEntropy, cctxParams,
-@@ -2615,15 +2876,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
+@@ -2615,15 +2921,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
      /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block.
       * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block.
       */
@@ -14229,7 +20333,117 @@ index f620cafca633..c1c316e9e289 100644
      return cSize;
  }
  
-@@ -2718,6 +2985,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr)
+@@ -2635,40 +2947,43 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS
+     static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = {
+         { ZSTD_compressBlock_fast  /* default for 0 */,
+           ZSTD_compressBlock_fast,
+-          ZSTD_compressBlock_doubleFast,
+-          ZSTD_compressBlock_greedy,
+-          ZSTD_compressBlock_lazy,
+-          ZSTD_compressBlock_lazy2,
+-          ZSTD_compressBlock_btlazy2,
+-          ZSTD_compressBlock_btopt,
+-          ZSTD_compressBlock_btultra,
+-          ZSTD_compressBlock_btultra2 },
++          ZSTD_COMPRESSBLOCK_DOUBLEFAST,
++          ZSTD_COMPRESSBLOCK_GREEDY,
++          ZSTD_COMPRESSBLOCK_LAZY,
++          ZSTD_COMPRESSBLOCK_LAZY2,
++          ZSTD_COMPRESSBLOCK_BTLAZY2,
++          ZSTD_COMPRESSBLOCK_BTOPT,
++          ZSTD_COMPRESSBLOCK_BTULTRA,
++          ZSTD_COMPRESSBLOCK_BTULTRA2
++        },
+         { ZSTD_compressBlock_fast_extDict  /* default for 0 */,
+           ZSTD_compressBlock_fast_extDict,
+-          ZSTD_compressBlock_doubleFast_extDict,
+-          ZSTD_compressBlock_greedy_extDict,
+-          ZSTD_compressBlock_lazy_extDict,
+-          ZSTD_compressBlock_lazy2_extDict,
+-          ZSTD_compressBlock_btlazy2_extDict,
+-          ZSTD_compressBlock_btopt_extDict,
+-          ZSTD_compressBlock_btultra_extDict,
+-          ZSTD_compressBlock_btultra_extDict },
++          ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT,
++          ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT,
++          ZSTD_COMPRESSBLOCK_LAZY_EXTDICT,
++          ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT,
++          ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT,
++          ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT,
++          ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT,
++          ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT
++        },
+         { ZSTD_compressBlock_fast_dictMatchState  /* default for 0 */,
+           ZSTD_compressBlock_fast_dictMatchState,
+-          ZSTD_compressBlock_doubleFast_dictMatchState,
+-          ZSTD_compressBlock_greedy_dictMatchState,
+-          ZSTD_compressBlock_lazy_dictMatchState,
+-          ZSTD_compressBlock_lazy2_dictMatchState,
+-          ZSTD_compressBlock_btlazy2_dictMatchState,
+-          ZSTD_compressBlock_btopt_dictMatchState,
+-          ZSTD_compressBlock_btultra_dictMatchState,
+-          ZSTD_compressBlock_btultra_dictMatchState },
++          ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE,
++          ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE,
++          ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE,
++          ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE,
++          ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE,
++          ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE,
++          ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE,
++          ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE
++        },
+         { NULL  /* default for 0 */,
+           NULL,
+           NULL,
+-          ZSTD_compressBlock_greedy_dedicatedDictSearch,
+-          ZSTD_compressBlock_lazy_dedicatedDictSearch,
+-          ZSTD_compressBlock_lazy2_dedicatedDictSearch,
++          ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH,
++          ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH,
++          ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH,
+           NULL,
+           NULL,
+           NULL,
+@@ -2681,18 +2996,26 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS
+     DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder);
+     if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) {
+         static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = {
+-            { ZSTD_compressBlock_greedy_row,
+-            ZSTD_compressBlock_lazy_row,
+-            ZSTD_compressBlock_lazy2_row },
+-            { ZSTD_compressBlock_greedy_extDict_row,
+-            ZSTD_compressBlock_lazy_extDict_row,
+-            ZSTD_compressBlock_lazy2_extDict_row },
+-            { ZSTD_compressBlock_greedy_dictMatchState_row,
+-            ZSTD_compressBlock_lazy_dictMatchState_row,
+-            ZSTD_compressBlock_lazy2_dictMatchState_row },
+-            { ZSTD_compressBlock_greedy_dedicatedDictSearch_row,
+-            ZSTD_compressBlock_lazy_dedicatedDictSearch_row,
+-            ZSTD_compressBlock_lazy2_dedicatedDictSearch_row }
++            {
++                ZSTD_COMPRESSBLOCK_GREEDY_ROW,
++                ZSTD_COMPRESSBLOCK_LAZY_ROW,
++                ZSTD_COMPRESSBLOCK_LAZY2_ROW
++            },
++            {
++                ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW,
++                ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW,
++                ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW
++            },
++            {
++                ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW,
++                ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW,
++                ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW
++            },
++            {
++                ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW,
++                ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW,
++                ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW
++            }
+         };
+         DEBUGLOG(4, "Selecting a row-based matchfinder");
+         assert(useRowMatchFinder != ZSTD_ps_auto);
+@@ -2718,6 +3041,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr)
      ssPtr->longLengthType = ZSTD_llt_none;
  }
  
@@ -14302,7 +20516,7 @@ index f620cafca633..c1c316e9e289 100644
  typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e;
  
  static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
-@@ -2727,7 +3060,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+@@ -2727,7 +3116,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
      assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
      /* Assert that we have correctly flushed the ctx params into the ms's copy */
      ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams);
@@ -14313,7 +20527,7 @@ index f620cafca633..c1c316e9e289 100644
          if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) {
              ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize);
          } else {
-@@ -2763,6 +3098,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+@@ -2763,6 +3154,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
          }
          if (zc->externSeqStore.pos < zc->externSeqStore.size) {
              assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable);
@@ -14321,7 +20535,7 @@ index f620cafca633..c1c316e9e289 100644
 +            /* External matchfinder + LDM is technically possible, just not implemented yet.
 +             * We need to revisit soon and implement it. */
 +            RETURN_ERROR_IF(
-+                zc->appliedParams.useSequenceProducer,
++                ZSTD_hasExtSeqProd(&zc->appliedParams),
 +                parameter_combination_unsupported,
 +                "Long-distance matching with external sequence producer enabled is not currently supported."
 +            );
@@ -14329,14 +20543,14 @@ index f620cafca633..c1c316e9e289 100644
              /* Updates ldmSeqStore.pos */
              lastLLSize =
                  ZSTD_ldm_blockCompress(&zc->externSeqStore,
-@@ -2774,6 +3118,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+@@ -2774,6 +3174,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
          } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {
              rawSeqStore_t ldmSeqStore = kNullRawSeqStore;
  
 +            /* External matchfinder + LDM is technically possible, just not implemented yet.
 +             * We need to revisit soon and implement it. */
 +            RETURN_ERROR_IF(
-+                zc->appliedParams.useSequenceProducer,
++                ZSTD_hasExtSeqProd(&zc->appliedParams),
 +                parameter_combination_unsupported,
 +                "Long-distance matching with external sequence producer enabled is not currently supported."
 +            );
@@ -14344,23 +20558,26 @@ index f620cafca633..c1c316e9e289 100644
              ldmSeqStore.seq = zc->ldmSequences;
              ldmSeqStore.capacity = zc->maxNbLdmSequences;
              /* Updates ldmSeqStore.size */
-@@ -2788,7 +3140,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+@@ -2788,10 +3196,74 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
                                         zc->appliedParams.useRowMatchFinder,
                                         src, srcSize);
              assert(ldmSeqStore.pos == ldmSeqStore.size);
 -        } else {   /* not long range mode */
-+        } else if (zc->appliedParams.useSequenceProducer) {
+-            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
+-                                                                                    zc->appliedParams.useRowMatchFinder,
+-                                                                                    dictMode);
++        } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) {
 +            assert(
-+                zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize)
++                zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize)
 +            );
-+            assert(zc->externalMatchCtx.mFinder != NULL);
++            assert(zc->appliedParams.extSeqProdFunc != NULL);
 +
 +            {   U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog;
 +
-+                size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)(
-+                    zc->externalMatchCtx.mState,
-+                    zc->externalMatchCtx.seqBuffer,
-+                    zc->externalMatchCtx.seqBufferCapacity,
++                size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)(
++                    zc->appliedParams.extSeqProdState,
++                    zc->extSeqBuf,
++                    zc->extSeqBufCapacity,
 +                    src, srcSize,
 +                    NULL, 0,  /* dict and dictSize, currently not supported */
 +                    zc->appliedParams.compressionLevel,
@@ -14368,21 +20585,21 @@ index f620cafca633..c1c316e9e289 100644
 +                );
 +
 +                size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult(
-+                    zc->externalMatchCtx.seqBuffer,
++                    zc->extSeqBuf,
 +                    nbExternalSeqs,
-+                    zc->externalMatchCtx.seqBufferCapacity,
++                    zc->extSeqBufCapacity,
 +                    srcSize
 +                );
 +
 +                /* Return early if there is no error, since we don't need to worry about last literals */
 +                if (!ZSTD_isError(nbPostProcessedSeqs)) {
 +                    ZSTD_sequencePosition seqPos = {0,0,0};
-+                    size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs);
++                    size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs);
 +                    RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!");
 +                    FORWARD_IF_ERROR(
 +                        ZSTD_copySequencesToSeqStoreExplicitBlockDelim(
 +                            zc, &seqPos,
-+                            zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs,
++                            zc->extSeqBuf, nbPostProcessedSeqs,
 +                            src, srcSize,
 +                            zc->appliedParams.searchForExternalRepcodes
 +                        ),
@@ -14399,9 +20616,11 @@ index f620cafca633..c1c316e9e289 100644
 +                }
 +
 +                /* Fallback to software matchfinder */
-+                {   ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
-+                                                                                            zc->appliedParams.useRowMatchFinder,
-+                                                                                            dictMode);
++                {   ZSTD_blockCompressor const blockCompressor =
++                        ZSTD_selectBlockCompressor(
++                            zc->appliedParams.cParams.strategy,
++                            zc->appliedParams.useRowMatchFinder,
++                            dictMode);
 +                    ms->ldmSeqStore = NULL;
 +                    DEBUGLOG(
 +                        5,
@@ -14411,30 +20630,177 @@ index f620cafca633..c1c316e9e289 100644
 +                    lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
 +            }   }
 +        } else {   /* not long range mode and no external matchfinder */
-             ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
-                                                                                     zc->appliedParams.useRowMatchFinder,
-                                                                                     dictMode);
-@@ -2849,7 +3262,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
-         /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
-            so we provide seqStoreSeqs[i].offset - 1 */
-         ZSTD_updateRep(updatedRepcodes.rep,
--                       seqStoreSeqs[i].offBase - 1,
-+                       seqStoreSeqs[i].offBase,
-                        seqStoreSeqs[i].litLength == 0);
-         literalsRead += outSeqs[i].litLength;
-     }
-@@ -2865,6 +3278,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
-     zc->seqCollector.seqIndex += seqStoreSeqSize;
++            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(
++                    zc->appliedParams.cParams.strategy,
++                    zc->appliedParams.useRowMatchFinder,
++                    dictMode);
+             ms->ldmSeqStore = NULL;
+             lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
+         }
+@@ -2801,29 +3273,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+     return ZSTDbss_compress;
  }
  
-+size_t ZSTD_sequenceBound(size_t srcSize) {
-+    return (srcSize / ZSTD_MINMATCH_MIN) + 1;
+-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
++static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM])
+ {
+-    const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
+-    const seqDef* seqStoreSeqs = seqStore->sequencesStart;
+-    size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs;
+-    size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart);
+-    size_t literalsRead = 0;
+-    size_t lastLLSize;
++    const seqDef* inSeqs = seqStore->sequencesStart;
++    const size_t nbInSequences = seqStore->sequences - inSeqs;
++    const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart);
+ 
+-    ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
++    ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex;
++    const size_t nbOutSequences = nbInSequences + 1;
++    size_t nbOutLiterals = 0;
++    repcodes_t repcodes;
+     size_t i;
+-    repcodes_t updatedRepcodes;
+ 
+-    assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
+-    /* Ensure we have enough space for last literals "sequence" */
+-    assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1);
+-    ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
+-    for (i = 0; i < seqStoreSeqSize; ++i) {
+-        U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM;
+-        outSeqs[i].litLength = seqStoreSeqs[i].litLength;
+-        outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH;
++    /* Bounds check that we have enough space for every input sequence
++     * and the block delimiter
++     */
++    assert(seqCollector->seqIndex <= seqCollector->maxSequences);
++    RETURN_ERROR_IF(
++        nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex),
++        dstSize_tooSmall,
++        "Not enough space to copy sequences");
++
++    ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes));
++    for (i = 0; i < nbInSequences; ++i) {
++        U32 rawOffset;
++        outSeqs[i].litLength = inSeqs[i].litLength;
++        outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH;
+         outSeqs[i].rep = 0;
+ 
++        /* Handle the possible single length >= 64K
++         * There can only be one because we add MINMATCH to every match length,
++         * and blocks are at most 128K.
++         */
+         if (i == seqStore->longLengthPos) {
+             if (seqStore->longLengthType == ZSTD_llt_literalLength) {
+                 outSeqs[i].litLength += 0x10000;
+@@ -2832,37 +3313,55 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
+             }
+         }
+ 
+-        if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) {
+-            /* Derive the correct offset corresponding to a repcode */
+-            outSeqs[i].rep = seqStoreSeqs[i].offBase;
++        /* Determine the raw offset given the offBase, which may be a repcode. */
++        if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) {
++            const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase);
++            assert(repcode > 0);
++            outSeqs[i].rep = repcode;
+             if (outSeqs[i].litLength != 0) {
+-                rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1];
++                rawOffset = repcodes.rep[repcode - 1];
+             } else {
+-                if (outSeqs[i].rep == 3) {
+-                    rawOffset = updatedRepcodes.rep[0] - 1;
++                if (repcode == 3) {
++                    assert(repcodes.rep[0] > 1);
++                    rawOffset = repcodes.rep[0] - 1;
+                 } else {
+-                    rawOffset = updatedRepcodes.rep[outSeqs[i].rep];
++                    rawOffset = repcodes.rep[repcode];
+                 }
+             }
++        } else {
++            rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase);
+         }
+         outSeqs[i].offset = rawOffset;
+-        /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
+-           so we provide seqStoreSeqs[i].offset - 1 */
+-        ZSTD_updateRep(updatedRepcodes.rep,
+-                       seqStoreSeqs[i].offBase - 1,
+-                       seqStoreSeqs[i].litLength == 0);
+-        literalsRead += outSeqs[i].litLength;
++
++        /* Update repcode history for the sequence */
++        ZSTD_updateRep(repcodes.rep,
++                       inSeqs[i].offBase,
++                       inSeqs[i].litLength == 0);
++
++        nbOutLiterals += outSeqs[i].litLength;
+     }
+     /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0.
+      * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker
+      * for the block boundary, according to the API.
+      */
+-    assert(seqStoreLiteralsSize >= literalsRead);
+-    lastLLSize = seqStoreLiteralsSize - literalsRead;
+-    outSeqs[i].litLength = (U32)lastLLSize;
+-    outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0;
+-    seqStoreSeqSize++;
+-    zc->seqCollector.seqIndex += seqStoreSeqSize;
++    assert(nbInLiterals >= nbOutLiterals);
++    {
++        const size_t lastLLSize = nbInLiterals - nbOutLiterals;
++        outSeqs[nbInSequences].litLength = (U32)lastLLSize;
++        outSeqs[nbInSequences].matchLength = 0;
++        outSeqs[nbInSequences].offset = 0;
++        assert(nbOutSequences == nbInSequences + 1);
++    }
++    seqCollector->seqIndex += nbOutSequences;
++    assert(seqCollector->seqIndex <= seqCollector->maxSequences);
++
++    return 0;
 +}
 +
++size_t ZSTD_sequenceBound(size_t srcSize) {
++    const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1;
++    const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1;
++    return maxNbSeq + maxNbDelims;
+ }
+ 
  size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
-                               size_t outSeqsSize, const void* src, size_t srcSize)
- {
-@@ -2910,19 +3327,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) {
+@@ -2871,6 +3370,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+     const size_t dstCapacity = ZSTD_compressBound(srcSize);
+     void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem);
+     SeqCollector seqCollector;
++    {
++        int targetCBlockSize;
++        FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), "");
++        RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0");
++    }
++    {
++        int nbWorkers;
++        FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), "");
++        RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0");
++    }
+ 
+     RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!");
+ 
+@@ -2880,8 +3389,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+     seqCollector.maxSequences = outSeqsSize;
+     zc->seqCollector = seqCollector;
+ 
+-    ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
+-    ZSTD_customFree(dst, ZSTD_defaultCMem);
++    {
++        const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
++        ZSTD_customFree(dst, ZSTD_defaultCMem);
++        FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed");
++    }
++    assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize));
+     return zc->seqCollector.seqIndex;
+ }
+ 
+@@ -2910,19 +3423,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) {
      const size_t unrollMask = unrollSize - 1;
      const size_t prefixLength = length & unrollMask;
      size_t i;
@@ -14456,7 +20822,7 @@ index f620cafca633..c1c316e9e289 100644
      return 1;
  }
  
-@@ -2938,7 +3353,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore)
+@@ -2938,7 +3449,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore)
      return nbSeqs < 4 && nbLits < 10;
  }
  
@@ -14466,7 +20832,7 @@ index f620cafca633..c1c316e9e289 100644
  {
      ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock;
      bs->prevCBlock = bs->nextCBlock;
-@@ -2946,7 +3362,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c
+@@ -2946,7 +3458,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c
  }
  
  /* Writes the block header */
@@ -14477,7 +20843,7 @@ index f620cafca633..c1c316e9e289 100644
      U32 const cBlockHeader = cSize == 1 ?
                          lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) :
                          lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
-@@ -2959,13 +3377,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB
+@@ -2959,13 +3473,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB
   *  Stores literals block type (raw, rle, compressed, repeat) and
   *  huffman description table to hufMetadata.
   *  Requires ENTROPY_WORKSPACE_SIZE workspace
@@ -14501,7 +20867,7 @@ index f620cafca633..c1c316e9e289 100644
  {
      BYTE* const wkspStart = (BYTE*)workspace;
      BYTE* const wkspEnd = wkspStart + wkspSize;
-@@ -2973,9 +3394,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
+@@ -2973,9 +3490,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
      unsigned* const countWksp = (unsigned*)workspace;
      const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned);
      BYTE* const nodeWksp = countWkspStart + countWkspSize;
@@ -14513,7 +20879,7 @@ index f620cafca633..c1c316e9e289 100644
      HUF_repeat repeat = prevHuf->repeatMode;
      DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize);
  
-@@ -2990,73 +3411,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
+@@ -2990,73 +3507,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
  
      /* small ? don't even attempt compression (speed opt) */
  #ifndef COMPRESS_LITERALS_SIZE_MIN
@@ -14625,7 +20991,7 @@ index f620cafca633..c1c316e9e289 100644
      }
  }
  
-@@ -3066,8 +3491,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
+@@ -3066,8 +3587,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
   * and updates nextEntropy to the appropriate repeatMode.
   */
  static ZSTD_symbolEncodingTypeStats_t
@@ -14637,7 +21003,7 @@ index f620cafca633..c1c316e9e289 100644
      nextEntropy->litlength_repeatMode = FSE_repeat_none;
      nextEntropy->offcode_repeatMode = FSE_repeat_none;
      nextEntropy->matchlength_repeatMode = FSE_repeat_none;
-@@ -3078,16 +3504,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) {
+@@ -3078,16 +3600,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) {
   *  Builds entropy for the sequences.
   *  Stores symbol compression modes and fse table to fseMetadata.
   *  Requires ENTROPY_WORKSPACE_SIZE wksp.
@@ -14664,7 +21030,7 @@ index f620cafca633..c1c316e9e289 100644
      BYTE* const ostart = fseMetadata->fseTablesBuffer;
      BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer);
      BYTE* op = ostart;
-@@ -3114,23 +3542,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr,
+@@ -3114,23 +3638,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr,
  /* ZSTD_buildBlockEntropyStats() :
   *  Builds entropy for the block.
   *  Requires workspace size ENTROPY_WORKSPACE_SIZE
@@ -14704,7 +21070,7 @@ index f620cafca633..c1c316e9e289 100644
      FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed");
      entropyMetadata->fseMetadata.fseTablesSize =
          ZSTD_buildBlockEntropyStats_sequences(seqStorePtr,
-@@ -3143,11 +3576,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
+@@ -3143,11 +3672,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
  }
  
  /* Returns the size estimate for the literals section (header + content) of a block */
@@ -14722,7 +21088,7 @@ index f620cafca633..c1c316e9e289 100644
  {
      unsigned* const countWksp = (unsigned*)workspace;
      unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
-@@ -3169,12 +3603,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz
+@@ -3169,12 +3699,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz
  }
  
  /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */
@@ -14742,7 +21108,7 @@ index f620cafca633..c1c316e9e289 100644
  {
      unsigned* const countWksp = (unsigned*)workspace;
      const BYTE* ctp = codeTable;
-@@ -3206,99 +3641,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
+@@ -3206,99 +3737,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
  }
  
  /* Returns the size estimate for the sequences section (header + content) of a block */
@@ -14895,7 +21261,7 @@ index f620cafca633..c1c316e9e289 100644
      return matchBytes;
  }
  
-@@ -3307,15 +3750,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
+@@ -3307,15 +3846,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
   */
  static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
                                 const seqStore_t* originalSeqStore,
@@ -14914,7 +21280,7 @@ index f620cafca633..c1c316e9e289 100644
      }
  
      /* Move longLengthPos into the correct position if necessary */
-@@ -3328,13 +3768,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
+@@ -3328,13 +3864,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
      }
      resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx;
      resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx;
@@ -14931,7 +21297,7 @@ index f620cafca633..c1c316e9e289 100644
      }
      resultSeqStore->llCode += startIdx;
      resultSeqStore->mlCode += startIdx;
-@@ -3342,20 +3781,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
+@@ -3342,20 +3877,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
  }
  
  /*
@@ -14968,7 +21334,7 @@ index f620cafca633..c1c316e9e289 100644
  }
  
  /*
-@@ -3371,30 +3816,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c
+@@ -3371,30 +3912,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c
   *        1-3 : repcode 1-3
   *        4+ : real_offset+3
   */
@@ -15013,7 +21379,7 @@ index f620cafca633..c1c316e9e289 100644
      }
  }
  
-@@ -3404,10 +3852,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
+@@ -3404,10 +3948,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
   * Returns the total size of that block (including header) or a ZSTD error code.
   */
  static size_t
@@ -15027,7 +21393,18 @@ index f620cafca633..c1c316e9e289 100644
                                    U32 lastBlock, U32 isPartition)
  {
      const U32 rleMaxLength = 25;
-@@ -3481,45 +3930,49 @@ typedef struct {
+@@ -3442,8 +3987,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore,
+         cSeqsSize = 1;
+     }
+ 
++    /* Sequence collection not supported when block splitting */
+     if (zc->seqCollector.collectSequences) {
+-        ZSTD_copyBlockSequences(zc);
++        FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed");
+         ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
+         return 0;
+     }
+@@ -3481,45 +4027,49 @@ typedef struct {
  
  /* Helper function to perform the recursive search for block splits.
   * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half.
@@ -15087,7 +21464,7 @@ index f620cafca633..c1c316e9e289 100644
          ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore);
          splits->splitLocations[splits->idx] = (U32)midIdx;
          splits->idx++;
-@@ -3527,14 +3980,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end
+@@ -3527,14 +4077,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end
      }
  }
  
@@ -15111,7 +21488,7 @@ index f620cafca633..c1c316e9e289 100644
          /* Refuse to try and split anything with less than 4 sequences */
          return 0;
      }
-@@ -3550,18 +4007,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
+@@ -3550,18 +4104,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
   * Returns combined size of all blocks (which includes headers), or a ZSTD error code.
   */
  static size_t
@@ -15138,7 +21515,7 @@ index f620cafca633..c1c316e9e289 100644
  
      /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history
       * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two
-@@ -3583,30 +4042,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
+@@ -3583,30 +4139,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
      ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
      ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t));
  
@@ -15179,7 +21556,7 @@ index f620cafca633..c1c316e9e289 100644
          srcBytesTotal += srcBytes;
          if (lastPartition) {
              /* This is the final partition, need to account for possible last literals */
-@@ -3621,7 +4081,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
+@@ -3621,7 +4178,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
                                                         op, dstCapacity,
                                                         ip, srcBytes,
                                                         lastBlockEntireSrc, 1 /* isPartition */);
@@ -15189,7 +21566,7 @@ index f620cafca633..c1c316e9e289 100644
          FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!");
  
          ip += srcBytes;
-@@ -3629,10 +4090,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
+@@ -3629,10 +4187,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
          dstCapacity -= cSizeChunk;
          cSize += cSizeChunk;
          *currSeqStore = *nextSeqStore;
@@ -15203,7 +21580,7 @@ index f620cafca633..c1c316e9e289 100644
       */
      ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t));
      return cSize;
-@@ -3643,8 +4104,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
+@@ -3643,8 +4201,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
                                void* dst, size_t dstCapacity,
                                const void* src, size_t srcSize, U32 lastBlock)
  {
@@ -15212,16 +21589,17 @@ index f620cafca633..c1c316e9e289 100644
      U32 nbSeq;
      size_t cSize;
      DEBUGLOG(4, "ZSTD_compressBlock_splitBlock");
-@@ -3655,7 +4114,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
+@@ -3655,7 +4211,8 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
          if (bss == ZSTDbss_noCompress) {
              if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
                  zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
 -            cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock);
++            RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
 +            cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
              FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
              DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block");
              return cSize;
-@@ -3673,9 +4132,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
+@@ -3673,9 +4230,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
                              void* dst, size_t dstCapacity,
                              const void* src, size_t srcSize, U32 frame)
  {
@@ -15234,7 +21612,25 @@ index f620cafca633..c1c316e9e289 100644
       */
      const U32 rleMaxLength = 25;
      size_t cSize;
-@@ -3767,10 +4226,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
+@@ -3687,11 +4244,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
+ 
+     {   const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
+         FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
+-        if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; }
++        if (bss == ZSTDbss_noCompress) {
++            RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
++            cSize = 0;
++            goto out;
++        }
+     }
+ 
+     if (zc->seqCollector.collectSequences) {
+-        ZSTD_copyBlockSequences(zc);
++        FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed");
+         ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
+         return 0;
+     }
+@@ -3767,10 +4328,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
           *   * cSize >= blockBound(srcSize): We have expanded the block too much so
           *     emit an uncompressed block.
           */
@@ -15249,7 +21645,7 @@ index f620cafca633..c1c316e9e289 100644
                  FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed");
                  if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) {
                      ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
-@@ -3778,7 +4238,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
+@@ -3778,7 +4340,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
                  }
              }
          }
@@ -15258,7 +21654,7 @@ index f620cafca633..c1c316e9e289 100644
  
      DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()");
      /* Superblock compression failed, attempt to emit a single no compress block.
-@@ -3836,7 +4296,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
+@@ -3836,7 +4398,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
  *   All blocks will be terminated, all input will be consumed.
  *   Function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
  *   Frame is supposed already started (header already produced)
@@ -15267,7 +21663,7 @@ index f620cafca633..c1c316e9e289 100644
  */
  static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
                                       void* dst, size_t dstCapacity,
-@@ -3860,7 +4320,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
+@@ -3860,7 +4422,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
          ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
          U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
  
@@ -15278,7 +21674,7 @@ index f620cafca633..c1c316e9e289 100644
                          dstSize_tooSmall,
                          "not enough space to store compressed block");
          if (remaining < blockSize) blockSize = remaining;
-@@ -3899,7 +4361,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
+@@ -3899,7 +4463,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
                      MEM_writeLE24(op, cBlockHeader);
                      cSize += ZSTD_blockHeaderSize;
                  }
@@ -15287,7 +21683,30 @@ index f620cafca633..c1c316e9e289 100644
  
  
              ip += blockSize;
-@@ -4078,31 +4540,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
+@@ -4001,19 +4565,15 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity)
+     }
+ }
+ 
+-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq)
++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq)
+ {
+-    RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong,
+-                    "wrong cctx stage");
+-    RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable,
+-                    parameter_unsupported,
+-                    "incompatible with ldm");
++    assert(cctx->stage == ZSTDcs_init);
++    assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable);
+     cctx->externSeqStore.seq = seq;
+     cctx->externSeqStore.size = nbSeq;
+     cctx->externSeqStore.capacity = nbSeq;
+     cctx->externSeqStore.pos = 0;
+     cctx->externSeqStore.posInSequence = 0;
+-    return 0;
+ }
+ 
+ 
+@@ -4078,31 +4638,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
      }
  }
  
@@ -15346,7 +21765,7 @@ index f620cafca633..c1c316e9e289 100644
  /*! ZSTD_loadDictionaryContent() :
   *  @return : 0, or an error code
   */
-@@ -4111,25 +4593,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
+@@ -4111,25 +4691,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
                                           ZSTD_cwksp* ws,
                                           ZSTD_CCtx_params const* params,
                                           const void* src, size_t srcSize,
@@ -15391,7 +21810,7 @@ index f620cafca633..c1c316e9e289 100644
          /* If the dictionary is too large, only load the suffix of the dictionary. */
          if (srcSize > maxDictSize) {
              ip = iend - maxDictSize;
-@@ -4138,30 +4631,46 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
+@@ -4138,35 +4729,58 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
          }
      }
  
@@ -15412,8 +21831,8 @@ index f620cafca633..c1c316e9e289 100644
          ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0);
          ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base);
 +        ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams);
-+    }
-+
+     }
+ 
 +    /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */
 +    if (params->cParams.strategy < ZSTD_btultra) {
 +        U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28);
@@ -15422,8 +21841,8 @@ index f620cafca633..c1c316e9e289 100644
 +            src = ip;
 +            srcSize = maxDictSize;
 +        }
-     }
- 
++    }
++
 +    ms->nextToUpdate = (U32)(ip - ms->window.base);
 +    ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
 +    ms->forceNonContiguous = params->deterministicRefPrefix;
@@ -15443,11 +21862,23 @@ index f620cafca633..c1c316e9e289 100644
          break;
      case ZSTD_dfast:
 -        ZSTD_fillDoubleHashTable(ms, iend, dtlm);
++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
 +        ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp);
++#else
++        assert(0); /* shouldn't be called: cparams should've been adjusted. */
++#endif
          break;
  
      case ZSTD_greedy:
-@@ -4174,7 +4683,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
+     case ZSTD_lazy:
+     case ZSTD_lazy2:
++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR)
+         assert(srcSize >= HASH_READ_SIZE);
+         if (ms->dedicatedDictSearch) {
+             assert(ms->chainTable != NULL);
+@@ -4174,7 +4788,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
          } else {
              assert(params->useRowMatchFinder != ZSTD_ps_auto);
              if (params->useRowMatchFinder == ZSTD_ps_enable) {
@@ -15456,7 +21887,44 @@ index f620cafca633..c1c316e9e289 100644
                  ZSTD_memset(ms->tagTable, 0, tagTableSize);
                  ZSTD_row_update(ms, iend-HASH_READ_SIZE);
                  DEBUGLOG(4, "Using row-based hash table for lazy dict");
-@@ -4327,6 +4836,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
+@@ -4183,14 +4797,23 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
+                 DEBUGLOG(4, "Using chain-based hash table for lazy dict");
+             }
+         }
++#else
++        assert(0); /* shouldn't be called: cparams should've been adjusted. */
++#endif
+         break;
+ 
+     case ZSTD_btlazy2:   /* we want the dictionary table fully sorted */
+     case ZSTD_btopt:
+     case ZSTD_btultra:
+     case ZSTD_btultra2:
++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
+         assert(srcSize >= HASH_READ_SIZE);
+         ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend);
++#else
++        assert(0); /* shouldn't be called: cparams should've been adjusted. */
++#endif
+         break;
+ 
+     default:
+@@ -4237,11 +4860,10 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
+ 
+         /* We only set the loaded table as valid if it contains all non-zero
+          * weights. Otherwise, we set it to check */
+-        if (!hasZeroWeights)
++        if (!hasZeroWeights && maxSymbolValue == 255)
+             bs->entropy.huf.repeatMode = HUF_repeat_valid;
+ 
+         RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, "");
+-        RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, "");
+         dictPtr += hufHeaderSize;
+     }
+ 
+@@ -4327,6 +4949,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
                                        ZSTD_CCtx_params const* params,
                                        const void* dict, size_t dictSize,
                                        ZSTD_dictTableLoadMethod_e dtlm,
@@ -15464,7 +21932,7 @@ index f620cafca633..c1c316e9e289 100644
                                        void* workspace)
  {
      const BYTE* dictPtr = (const BYTE*)dict;
-@@ -4345,7 +4855,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
+@@ -4345,7 +4968,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
      {
          size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
          FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
@@ -15473,7 +21941,7 @@ index f620cafca633..c1c316e9e289 100644
      }
      return dictID;
  }
-@@ -4361,6 +4871,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
+@@ -4361,6 +4984,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
                           const void* dict, size_t dictSize,
                                 ZSTD_dictContentType_e dictContentType,
                                 ZSTD_dictTableLoadMethod_e dtlm,
@@ -15481,7 +21949,7 @@ index f620cafca633..c1c316e9e289 100644
                                 void* workspace)
  {
      DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
-@@ -4373,13 +4884,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
+@@ -4373,13 +4997,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
  
      /* dict restricted modes */
      if (dictContentType == ZSTD_dct_rawContent)
@@ -15497,7 +21965,7 @@ index f620cafca633..c1c316e9e289 100644
          }
          RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
          assert(0);   /* impossible */
-@@ -4387,13 +4898,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
+@@ -4387,13 +5011,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
  
      /* dict as full zstd dictionary */
      return ZSTD_loadZstdDictionary(
@@ -15513,7 +21981,7 @@ index f620cafca633..c1c316e9e289 100644
   * @return : 0, or an error code */
  static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
                                      const void* dict, size_t dictSize,
-@@ -4426,11 +4938,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
+@@ -4426,11 +5051,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
                          cctx->blockState.prevCBlock, &cctx->blockState.matchState,
                          &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent,
                          cdict->dictContentSize, cdict->dictContentType, dtlm,
@@ -15527,7 +21995,7 @@ index f620cafca633..c1c316e9e289 100644
          FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
          assert(dictID <= UINT_MAX);
          cctx->dictID = (U32)dictID;
-@@ -4471,11 +4983,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
+@@ -4471,11 +5096,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
                                              &cctxParams, pledgedSrcSize);
  }
  
@@ -15542,7 +22010,7 @@ index f620cafca633..c1c316e9e289 100644
          ZSTD_CCtxParams_init_internal(&cctxParams, &params, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel);
      }
      DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize);
-@@ -4483,9 +4995,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di
+@@ -4483,9 +5108,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di
                                         &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered);
  }
  
@@ -15559,7 +22027,35 @@ index f620cafca633..c1c316e9e289 100644
  }
  
  
-@@ -4537,9 +5055,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize)
+@@ -4496,14 +5127,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity)
+ {
+     BYTE* const ostart = (BYTE*)dst;
+     BYTE* op = ostart;
+-    size_t fhSize = 0;
+ 
+     DEBUGLOG(4, "ZSTD_writeEpilogue");
+     RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing");
+ 
+     /* special case : empty frame */
+     if (cctx->stage == ZSTDcs_init) {
+-        fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0);
++        size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0);
+         FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed");
+         dstCapacity -= fhSize;
+         op += fhSize;
+@@ -4513,8 +5143,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity)
+     if (cctx->stage != ZSTDcs_ending) {
+         /* write one last empty block, make it the "last" block */
+         U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0;
+-        RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue");
+-        MEM_writeLE32(op, cBlockHeader24);
++        ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3);
++        RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue");
++        MEM_writeLE24(op, cBlockHeader24);
+         op += ZSTD_blockHeaderSize;
+         dstCapacity -= ZSTD_blockHeaderSize;
+     }
+@@ -4537,9 +5168,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize)
      (void)extraCSize;
  }
  
@@ -15572,7 +22068,7 @@ index f620cafca633..c1c316e9e289 100644
  {
      size_t endResult;
      size_t const cSize = ZSTD_compressContinue_internal(cctx,
-@@ -4563,6 +5081,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
+@@ -4563,6 +5194,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
      return cSize + endResult;
  }
  
@@ -15587,7 +22083,7 @@ index f620cafca633..c1c316e9e289 100644
  size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
                                 void* dst, size_t dstCapacity,
                           const void* src, size_t srcSize,
-@@ -4591,7 +5117,7 @@ size_t ZSTD_compress_advanced_internal(
+@@ -4591,7 +5230,7 @@ size_t ZSTD_compress_advanced_internal(
      FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx,
                           dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
                           params, srcSize, ZSTDb_not_buffered) , "");
@@ -15596,7 +22092,7 @@ index f620cafca633..c1c316e9e289 100644
  }
  
  size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,
-@@ -4709,7 +5235,7 @@ static size_t ZSTD_initCDict_internal(
+@@ -4709,7 +5348,7 @@ static size_t ZSTD_initCDict_internal(
          {   size_t const dictID = ZSTD_compress_insertDictionary(
                      &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace,
                      &params, cdict->dictContent, cdict->dictContentSize,
@@ -15605,7 +22101,16 @@ index f620cafca633..c1c316e9e289 100644
              FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
              assert(dictID <= (size_t)(U32)-1);
              cdict->dictID = (U32)dictID;
-@@ -4906,6 +5432,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict(
+@@ -4811,7 +5450,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2(
+                         cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch,
+                         customMem);
+ 
+-    if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
++    if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict,
+                                     dict, dictSize,
+                                     dictLoadMethod, dictContentType,
+                                     cctxParams) )) {
+@@ -4906,6 +5545,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict(
      params.cParams = cParams;
      params.useRowMatchFinder = useRowMatchFinder;
      cdict->useRowMatchFinder = useRowMatchFinder;
@@ -15613,7 +22118,7 @@ index f620cafca633..c1c316e9e289 100644
  
      if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
                                                dict, dictSize,
-@@ -4985,12 +5512,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced(
+@@ -4985,12 +5625,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced(
  
  /* ZSTD_compressBegin_usingCDict() :
   * cdict must be != NULL */
@@ -15632,7 +22137,7 @@ index f620cafca633..c1c316e9e289 100644
  /*! ZSTD_compress_usingCDict_internal():
   * Implementation of various ZSTD_compress_usingCDict* functions.
   */
-@@ -5000,7 +5532,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx,
+@@ -5000,7 +5645,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx,
                                  const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)
  {
      FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */
@@ -15641,7 +22146,7 @@ index f620cafca633..c1c316e9e289 100644
  }
  
  /*! ZSTD_compress_usingCDict_advanced():
-@@ -5197,30 +5729,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
+@@ -5197,30 +5842,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
  
  static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx)
  {
@@ -15689,13 +22194,13 @@ index f620cafca633..c1c316e9e289 100644
 +    if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) {
 +        assert(input->pos >= zcs->stableIn_notConsumed);
 +        input->pos -= zcs->stableIn_notConsumed;
-+        ip -= zcs->stableIn_notConsumed;
++        if (ip) ip -= zcs->stableIn_notConsumed;
 +        zcs->stableIn_notConsumed = 0;
 +    }
      if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) {
          assert(zcs->inBuff != NULL);
          assert(zcs->inBuffSize > 0);
-@@ -5229,8 +5772,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+@@ -5229,8 +5885,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
          assert(zcs->outBuff !=  NULL);
          assert(zcs->outBuffSize > 0);
      }
@@ -15707,7 +22212,7 @@ index f620cafca633..c1c316e9e289 100644
      assert((U32)flushMode <= (U32)ZSTD_e_end);
  
      while (someMoreWork) {
-@@ -5245,7 +5790,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+@@ -5245,7 +5903,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                  || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)  /* OR we are allowed to return dstSizeTooSmall */
                && (zcs->inBuffPos == 0) ) {
                  /* shortcut to compression pass directly into output buffer */
@@ -15716,7 +22221,7 @@ index f620cafca633..c1c316e9e289 100644
                                                  op, oend-op, ip, iend-ip);
                  DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize);
                  FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed");
-@@ -5262,8 +5807,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+@@ -5262,8 +5920,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                                          zcs->inBuff + zcs->inBuffPos, toLoad,
                                          ip, iend-ip);
                  zcs->inBuffPos += loaded;
@@ -15726,7 +22231,7 @@ index f620cafca633..c1c316e9e289 100644
                  if ( (flushMode == ZSTD_e_continue)
                    && (zcs->inBuffPos < zcs->inBuffTarget) ) {
                      /* not enough input to fill full block : stop here */
-@@ -5274,6 +5818,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+@@ -5274,6 +5931,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                      /* empty */
                      someMoreWork = 0; break;
                  }
@@ -15747,7 +22252,7 @@ index f620cafca633..c1c316e9e289 100644
              }
              /* compress current block (note : this stage cannot be stopped in the middle) */
              DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode);
-@@ -5281,9 +5839,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+@@ -5281,9 +5952,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                  void* cDst;
                  size_t cSize;
                  size_t oSize = oend-op;
@@ -15759,7 +22264,7 @@ index f620cafca633..c1c316e9e289 100644
                  if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)
                      cDst = op;   /* compress into output buffer, to skip flush stage */
                  else
-@@ -5291,9 +5848,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+@@ -5291,9 +5961,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                  if (inputBuffered) {
                      unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend);
                      cSize = lastBlock ?
@@ -15771,7 +22276,7 @@ index f620cafca633..c1c316e9e289 100644
                                          zcs->inBuff + zcs->inToCompress, iSize);
                      FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
                      zcs->frameEnded = lastBlock;
-@@ -5306,19 +5863,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+@@ -5306,19 +5976,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                      if (!lastBlock)
                          assert(zcs->inBuffTarget <= zcs->inBuffSize);
                      zcs->inToCompress = zcs->inBuffPos;
@@ -15797,7 +22302,7 @@ index f620cafca633..c1c316e9e289 100644
                  }
                  if (cDst == op) {  /* no need to flush */
                      op += cSize;
-@@ -5388,8 +5942,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf
+@@ -5388,8 +6055,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf
  /* After a compression call set the expected input/output buffer.
   * This is validated at the start of the next compression call.
   */
@@ -15809,7 +22314,7 @@ index f620cafca633..c1c316e9e289 100644
      if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
          cctx->expectedInBuffer = *input;
      }
-@@ -5408,22 +5964,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx,
+@@ -5408,22 +6077,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx,
  {
      if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
          ZSTD_inBuffer const expect = cctx->expectedInBuffer;
@@ -15838,7 +22343,7 @@ index f620cafca633..c1c316e9e289 100644
      ZSTD_CCtx_params params = cctx->requestedParams;
      ZSTD_prefixDict const prefixDict = cctx->prefixDict;
      FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */
-@@ -5437,9 +5993,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
+@@ -5437,9 +6106,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
          params.compressionLevel = cctx->cdict->compressionLevel;
      }
      DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage");
@@ -15851,7 +22356,7 @@ index f620cafca633..c1c316e9e289 100644
                  ? prefixDict.dictSize
                  : (cctx->cdict ? cctx->cdict->dictContentSize : 0);
          ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, &params, cctx->pledgedSrcSizePlusOne - 1);
-@@ -5451,6 +6007,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
+@@ -5451,6 +6120,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
      params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams);
      params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams);
      params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
@@ -15861,7 +22366,7 @@ index f620cafca633..c1c316e9e289 100644
  
      {   U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1;
          assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
-@@ -5477,6 +6036,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
+@@ -5477,6 +6149,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
      return 0;
  }
  
@@ -15870,7 +22375,7 @@ index f620cafca633..c1c316e9e289 100644
  size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
                               ZSTD_outBuffer* output,
                               ZSTD_inBuffer* input,
-@@ -5491,8 +6052,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+@@ -5491,8 +6165,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
  
      /* transparent initialization stage */
      if (cctx->streamStage == zcss_init) {
@@ -15900,7 +22405,7 @@ index f620cafca633..c1c316e9e289 100644
      }
      /* end of transparent initialization stage */
  
-@@ -5510,13 +6090,20 @@ size_t ZSTD_compressStream2_simpleArgs (
+@@ -5510,13 +6203,20 @@ size_t ZSTD_compressStream2_simpleArgs (
                        const void* src, size_t srcSize, size_t* srcPos,
                              ZSTD_EndDirective endOp)
  {
@@ -15927,7 +22432,7 @@ index f620cafca633..c1c316e9e289 100644
  }
  
  size_t ZSTD_compress2(ZSTD_CCtx* cctx,
-@@ -5539,6 +6126,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
+@@ -5539,6 +6239,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
          /* Reset to the original values. */
          cctx->requestedParams.inBufferMode = originalInBufferMode;
          cctx->requestedParams.outBufferMode = originalOutBufferMode;
@@ -15935,7 +22440,7 @@ index f620cafca633..c1c316e9e289 100644
          FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed");
          if (result != 0) {  /* compression not completed, due to lack of output space */
              assert(oPos == dstCapacity);
-@@ -5549,64 +6137,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
+@@ -5549,64 +6250,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
      }
  }
  
@@ -16020,7 +22525,7 @@ index f620cafca633..c1c316e9e289 100644
      if (cctx->cdict) {
          dictSize = (U32)cctx->cdict->dictContentSize;
      } else if (cctx->prefixDict.dict) {
-@@ -5615,25 +6200,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
+@@ -5615,25 +6313,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
          dictSize = 0;
      }
      ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
@@ -16032,8 +22537,7 @@ index f620cafca633..c1c316e9e289 100644
 -        U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
 -        ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
 +        U32 offBase;
- 
--        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
++
 +        if (externalRepSearch == ZSTD_ps_disable) {
 +            offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset);
 +        } else {
@@ -16041,14 +22545,15 @@ index f620cafca633..c1c316e9e289 100644
 +            offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
 +            ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
 +        }
-+
+ 
+-        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
 +        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
          if (cctx->appliedParams.validateSequences) {
              seqPos->posInSrc += litLength + matchLength;
 -            FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
 -                                                cctx->appliedParams.cParams.windowLog, dictSize),
 +            FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
-+                                                cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
++                                                cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)),
                                                  "Sequence validation failed");
          }
 -        RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
@@ -16085,7 +22590,7 @@ index f620cafca633..c1c316e9e289 100644
      ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t));
  
      if (inSeqs[idx].litLength) {
-@@ -5642,26 +6257,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
+@@ -5642,26 +6370,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
          ip += inSeqs[idx].litLength;
          seqPos->posInSrc += inSeqs[idx].litLength;
      }
@@ -16115,7 +22620,7 @@ index f620cafca633..c1c316e9e289 100644
  {
      U32 idx = seqPos->idx;
      U32 startPosInSequence = seqPos->posInSequence;
-@@ -5673,6 +6277,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
+@@ -5673,6 +6390,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
      U32 bytesAdjustment = 0;
      U32 finalMatchSplit = 0;
  
@@ -16125,7 +22630,7 @@ index f620cafca633..c1c316e9e289 100644
      if (cctx->cdict) {
          dictSize = cctx->cdict->dictContentSize;
      } else if (cctx->prefixDict.dict) {
-@@ -5680,7 +6287,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
+@@ -5680,7 +6400,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
      } else {
          dictSize = 0;
      }
@@ -16134,7 +22639,7 @@ index f620cafca633..c1c316e9e289 100644
      DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
      ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
      while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) {
-@@ -5688,7 +6295,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
+@@ -5688,7 +6408,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
          U32 litLength = currSeq.litLength;
          U32 matchLength = currSeq.matchLength;
          U32 const rawOffset = currSeq.offset;
@@ -16143,7 +22648,7 @@ index f620cafca633..c1c316e9e289 100644
  
          /* Modify the sequence depending on where endPosInSequence lies */
          if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) {
-@@ -5702,7 +6309,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
+@@ -5702,7 +6422,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
              /* Move to the next sequence */
              endPosInSequence -= currSeq.litLength + currSeq.matchLength;
              startPosInSequence = 0;
@@ -16151,7 +22656,7 @@ index f620cafca633..c1c316e9e289 100644
          } else {
              /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence
                 does not reach the end of the match. So, we have to split the sequence */
-@@ -5742,21 +6348,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
+@@ -5742,21 +6461,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
          }
          /* Check if this offset can be represented with a repcode */
          {   U32 const ll0 = (litLength == 0);
@@ -16166,7 +22671,7 @@ index f620cafca633..c1c316e9e289 100644
 -            FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
 -                                                   cctx->appliedParams.cParams.windowLog, dictSize),
 +            FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
-+                                                   cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer),
++                                                   cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)),
                                                     "Sequence validation failed");
          }
 -        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
@@ -16182,7 +22687,7 @@ index f620cafca633..c1c316e9e289 100644
      }
      DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
      assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength);
-@@ -5779,7 +6387,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
+@@ -5779,7 +6500,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
  
  typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
                                         const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
@@ -16191,7 +22696,7 @@ index f620cafca633..c1c316e9e289 100644
  static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
  {
      ZSTD_sequenceCopier sequenceCopier = NULL;
-@@ -5793,6 +6401,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
+@@ -5793,6 +6514,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
      return sequenceCopier;
  }
  
@@ -16249,7 +22754,7 @@ index f620cafca633..c1c316e9e289 100644
  /* Compress, block-by-block, all of the sequences given.
   *
   * Returns the cumulative size of all compressed blocks (including their headers),
-@@ -5805,9 +6464,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+@@ -5805,9 +6577,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
                            const void* src, size_t srcSize)
  {
      size_t cSize = 0;
@@ -16259,7 +22764,7 @@ index f620cafca633..c1c316e9e289 100644
      size_t remaining = srcSize;
      ZSTD_sequencePosition seqPos = {0, 0, 0};
  
-@@ -5827,22 +6483,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+@@ -5827,22 +6596,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
      }
  
      while (remaining) {
@@ -16295,7 +22800,7 @@ index f620cafca633..c1c316e9e289 100644
              cSize += cBlockSize;
              ip += blockSize;
              op += cBlockSize;
-@@ -5851,6 +6514,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+@@ -5851,6 +6627,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
              continue;
          }
  
@@ -16303,7 +22808,7 @@ index f620cafca633..c1c316e9e289 100644
          compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore,
                                  &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy,
                                  &cctx->appliedParams,
-@@ -5859,11 +6523,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+@@ -5859,11 +6636,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
                                  cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
                                  cctx->bmi2);
          FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed");
@@ -16317,7 +22822,7 @@ index f620cafca633..c1c316e9e289 100644
              /* We don't want to emit our first block as a RLE even if it qualifies because
              * doing so will cause the decoder (cli only) to throw a "should consume all input error."
              * This is only an issue for zstd <= v1.4.3
-@@ -5874,12 +6538,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+@@ -5874,12 +6651,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
          if (compressedSeqsSize == 0) {
              /* ZSTD_noCompressBlock writes the block header as well */
              cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
@@ -16334,7 +22839,7 @@ index f620cafca633..c1c316e9e289 100644
          } else {
              U32 cBlockHeader;
              /* Error checking and repcodes update */
-@@ -5891,11 +6555,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+@@ -5891,11 +6668,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
              cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3);
              MEM_writeLE24(op, cBlockHeader);
              cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize;
@@ -16347,7 +22852,7 @@ index f620cafca633..c1c316e9e289 100644
  
          if (lastBlock) {
              break;
-@@ -5906,12 +6569,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
+@@ -5906,12 +6682,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
              dstCapacity -= cBlockSize;
              cctx->isFirstBlock = 0;
          }
@@ -16364,7 +22869,7 @@ index f620cafca633..c1c316e9e289 100644
                                const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
                                const void* src, size_t srcSize)
  {
-@@ -5921,7 +6587,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
+@@ -5921,7 +6700,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
      size_t frameHeaderSize = 0;
  
      /* Transparent initialization stage, same as compressStream2() */
@@ -16373,7 +22878,7 @@ index f620cafca633..c1c316e9e289 100644
      assert(cctx != NULL);
      FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed");
      /* Begin writing output, starting with frame header */
-@@ -5949,26 +6615,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
+@@ -5949,26 +6728,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
          cSize += 4;
      }
  
@@ -16412,7 +22917,7 @@ index f620cafca633..c1c316e9e289 100644
      if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush;   /* minimal estimation */
      /* single thread mode : attempt to calculate remaining to flush more precisely */
      {   size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE;
-@@ -6090,7 +6764,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel,
+@@ -6090,7 +6877,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel,
              cp.targetLength = (unsigned)(-clampedCompressionLevel);
          }
          /* refine parameters based on srcSize & dictSize */
@@ -16421,30 +22926,38 @@ index f620cafca633..c1c316e9e289 100644
      }
  }
  
-@@ -6125,3 +6799,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH
+@@ -6125,3 +6912,29 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH
      if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
      return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown);
  }
 +
 +void ZSTD_registerSequenceProducer(
-+    ZSTD_CCtx* zc, void* mState,
-+    ZSTD_sequenceProducer_F* mFinder
++    ZSTD_CCtx* zc,
++    void* extSeqProdState,
++    ZSTD_sequenceProducer_F extSeqProdFunc
 +) {
-+    if (mFinder != NULL) {
-+        ZSTD_externalMatchCtx emctx;
-+        emctx.mState = mState;
-+        emctx.mFinder = mFinder;
-+        emctx.seqBuffer = NULL;
-+        emctx.seqBufferCapacity = 0;
-+        zc->externalMatchCtx = emctx;
-+        zc->requestedParams.useSequenceProducer = 1;
++    assert(zc != NULL);
++    ZSTD_CCtxParams_registerSequenceProducer(
++        &zc->requestedParams, extSeqProdState, extSeqProdFunc
++    );
++}
++
++void ZSTD_CCtxParams_registerSequenceProducer(
++  ZSTD_CCtx_params* params,
++  void* extSeqProdState,
++  ZSTD_sequenceProducer_F extSeqProdFunc
++) {
++    assert(params != NULL);
++    if (extSeqProdFunc != NULL) {
++        params->extSeqProdFunc = extSeqProdFunc;
++        params->extSeqProdState = extSeqProdState;
 +    } else {
-+        ZSTD_memset(&zc->externalMatchCtx, 0, sizeof(zc->externalMatchCtx));
-+        zc->requestedParams.useSequenceProducer = 0;
++        params->extSeqProdFunc = NULL;
++        params->extSeqProdState = NULL;
 +    }
 +}
 diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h
-index 71697a11ae30..899f5e2de8e9 100644
+index 71697a11ae30..53cb582a8d2b 100644
 --- a/lib/zstd/compress/zstd_compress_internal.h
 +++ b/lib/zstd/compress/zstd_compress_internal.h
 @@ -1,5 +1,6 @@
@@ -16463,6 +22976,15 @@ index 71697a11ae30..899f5e2de8e9 100644
  
  
  /*-*************************************
+@@ -32,7 +34,7 @@
+                                        It's not a big deal though : candidate will just be sorted again.
+                                        Additionally, candidate position 1 will be lost.
+                                        But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
+-                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy.
++                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy.
+                                        This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */
+ 
+ 
 @@ -111,12 +113,13 @@ typedef struct {
  /* ZSTD_buildBlockEntropyStats() :
   *  Builds entropy for the block.
@@ -16483,7 +23005,7 @@ index 71697a11ae30..899f5e2de8e9 100644
  
  /* *******************************
  *  Compression internals structs *
-@@ -142,6 +145,12 @@ typedef struct {
+@@ -142,26 +145,33 @@ typedef struct {
    size_t capacity;      /* The capacity starting from `seq` pointer */
  } rawSeqStore_t;
  
@@ -16496,19 +23018,47 @@ index 71697a11ae30..899f5e2de8e9 100644
  UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0};
  
  typedef struct {
-@@ -212,8 +221,10 @@ struct ZSTD_matchState_t {
+-    int price;
+-    U32 off;
+-    U32 mlen;
+-    U32 litlen;
+-    U32 rep[ZSTD_REP_NUM];
++    int price;  /* price from beginning of segment to this position */
++    U32 off;    /* offset of previous match */
++    U32 mlen;   /* length of previous match */
++    U32 litlen; /* nb of literals since previous match */
++    U32 rep[ZSTD_REP_NUM];  /* offset history after previous match */
+ } ZSTD_optimal_t;
+ 
+ typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e;
+ 
++#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3)
+ typedef struct {
+     /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */
+     unsigned* litFreq;           /* table of literals statistics, of size 256 */
+     unsigned* litLengthFreq;     /* table of litLength statistics, of size (MaxLL+1) */
+     unsigned* matchLengthFreq;   /* table of matchLength statistics, of size (MaxML+1) */
+     unsigned* offCodeFreq;       /* table of offCode statistics, of size (MaxOff+1) */
+-    ZSTD_match_t* matchTable;    /* list of found matches, of size ZSTD_OPT_NUM+1 */
+-    ZSTD_optimal_t* priceTable;  /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */
++    ZSTD_match_t* matchTable;    /* list of found matches, of size ZSTD_OPT_SIZE */
++    ZSTD_optimal_t* priceTable;  /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */
+ 
+     U32  litSum;                 /* nb of literals */
+     U32  litLengthSum;           /* nb of litLength codes */
+@@ -212,8 +222,10 @@ struct ZSTD_matchState_t {
      U32 hashLog3;           /* dispatch table for matches of len==3 : larger == faster, more memory */
  
      U32 rowHashLog;                          /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/
 -    U16* tagTable;                           /* For row-based matchFinder: A row-based table containing the hashes and head index. */
 +    BYTE* tagTable;                          /* For row-based matchFinder: A row-based table containing the hashes and head index. */
      U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */
-+    U64 hashSalt;                            /* For row-based matchFinder: salts the hash for re-use of tag table */
++    U64 hashSalt;                            /* For row-based matchFinder: salts the hash for reuse of tag table */
 +    U32 hashSaltEntropy;                     /* For row-based matchFinder: collects entropy for salt generation */
  
      U32* hashTable;
      U32* hashTable3;
-@@ -228,6 +239,18 @@ struct ZSTD_matchState_t {
+@@ -228,6 +240,18 @@ struct ZSTD_matchState_t {
      const ZSTD_matchState_t* dictMatchState;
      ZSTD_compressionParameters cParams;
      const rawSeqStore_t* ldmSeqStore;
@@ -16527,7 +23077,7 @@ index 71697a11ae30..899f5e2de8e9 100644
  };
  
  typedef struct {
-@@ -324,6 +347,24 @@ struct ZSTD_CCtx_params_s {
+@@ -324,6 +348,25 @@ struct ZSTD_CCtx_params_s {
  
      /* Internal use, for createCCtxParams() and freeCCtxParams() only */
      ZSTD_customMem customMem;
@@ -16539,10 +23089,11 @@ index 71697a11ae30..899f5e2de8e9 100644
 +     * if the external matchfinder returns an error code. */
 +    int enableMatchFinderFallback;
 +
-+    /* Indicates whether an external matchfinder has been referenced.
-+     * Users can't set this externally.
-+     * It is set internally in ZSTD_registerSequenceProducer(). */
-+    int useSequenceProducer;
++    /* Parameters for the external sequence producer API.
++     * Users set these parameters through ZSTD_registerSequenceProducer().
++     * It is not possible to set these parameters individually through the public API. */
++    void* extSeqProdState;
++    ZSTD_sequenceProducer_F extSeqProdFunc;
 +
 +    /* Adjust the max block size*/
 +    size_t maxBlockSize;
@@ -16552,22 +23103,7 @@ index 71697a11ae30..899f5e2de8e9 100644
  };  /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
  
  #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2))
-@@ -355,6 +396,14 @@ typedef struct {
-     ZSTD_entropyCTablesMetadata_t entropyMetadata;
- } ZSTD_blockSplitCtx;
- 
-+/* Context for block-level external matchfinder API */
-+typedef struct {
-+  void* mState;
-+  ZSTD_sequenceProducer_F* mFinder;
-+  ZSTD_Sequence* seqBuffer;
-+  size_t seqBufferCapacity;
-+} ZSTD_externalMatchCtx;
-+
- struct ZSTD_CCtx_s {
-     ZSTD_compressionStage_e stage;
-     int cParamsChanged;                  /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
-@@ -404,6 +453,7 @@ struct ZSTD_CCtx_s {
+@@ -404,6 +447,7 @@ struct ZSTD_CCtx_s {
  
      /* Stable in/out buffer verification */
      ZSTD_inBuffer expectedInBuffer;
@@ -16575,13 +23111,14 @@ index 71697a11ae30..899f5e2de8e9 100644
      size_t expectedOutBufferSize;
  
      /* Dictionary */
-@@ -417,9 +467,13 @@ struct ZSTD_CCtx_s {
+@@ -417,9 +461,14 @@ struct ZSTD_CCtx_s {
  
      /* Workspace for block splitter */
      ZSTD_blockSplitCtx blockSplitCtx;
 +
-+    /* Workspace for external matchfinder */
-+    ZSTD_externalMatchCtx externalMatchCtx;
++    /* Buffer for output from external sequence producer */
++    ZSTD_Sequence* extSeqBuf;
++    size_t extSeqBufCapacity;
  };
  
  typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
@@ -16589,7 +23126,7 @@ index 71697a11ae30..899f5e2de8e9 100644
  
  typedef enum {
      ZSTD_noDict = 0,
-@@ -441,7 +495,7 @@ typedef enum {
+@@ -441,7 +490,7 @@ typedef enum {
                                   * In this mode we take both the source size and the dictionary size
                                   * into account when selecting and adjusting the parameters.
                                   */
@@ -16598,7 +23135,7 @@ index 71697a11ae30..899f5e2de8e9 100644
                                   * We don't know what these parameters are for. We default to the legacy
                                   * behavior of taking both the source size and the dict size into account
                                   * when selecting and adjusting parameters.
-@@ -500,9 +554,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value)
+@@ -500,9 +549,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value)
  /* ZSTD_noCompressBlock() :
   * Writes uncompressed block to dst buffer from given src.
   * Returns the size of the block */
@@ -16611,7 +23148,7 @@ index 71697a11ae30..899f5e2de8e9 100644
      RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity,
                      dstSize_tooSmall, "dst buf too small for uncompressed block");
      MEM_writeLE24(dst, cBlockHeader24);
-@@ -510,7 +566,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi
+@@ -510,7 +561,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi
      return ZSTD_blockHeaderSize + srcSize;
  }
  
@@ -16621,7 +23158,7 @@ index 71697a11ae30..899f5e2de8e9 100644
  {
      BYTE* const op = (BYTE*)dst;
      U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3);
-@@ -529,7 +586,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
+@@ -529,7 +581,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
  {
      U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6;
      ZSTD_STATIC_ASSERT(ZSTD_btultra == 8);
@@ -16630,7 +23167,7 @@ index 71697a11ae30..899f5e2de8e9 100644
      return (srcSize >> minlog) + 2;
  }
  
-@@ -565,29 +622,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con
+@@ -565,29 +617,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con
      while (ip < iend) *op++ = *ip++;
  }
  
@@ -16674,7 +23211,7 @@ index 71697a11ae30..899f5e2de8e9 100644
                size_t matchLength)
  {
      BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH;
-@@ -596,8 +651,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
+@@ -596,8 +646,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
      static const BYTE* g_start = NULL;
      if (g_start==NULL) g_start = (const BYTE*)literals;  /* note : index only works for compression within a single segment */
      {   U32 const pos = (U32)((const BYTE*)literals - g_start);
@@ -16685,7 +23222,7 @@ index 71697a11ae30..899f5e2de8e9 100644
      }
  #endif
      assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
-@@ -607,9 +662,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
+@@ -607,9 +657,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
      assert(literals + litLength <= litLimit);
      if (litEnd <= litLimit_w) {
          /* Common case we can use wildcopy.
@@ -16698,7 +23235,7 @@ index 71697a11ae30..899f5e2de8e9 100644
          ZSTD_copy16(seqStorePtr->lit, literals);
          if (litLength > 16) {
              ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap);
-@@ -628,7 +683,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
+@@ -628,7 +678,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
      seqStorePtr->sequences[0].litLength = (U16)litLength;
  
      /* match offset */
@@ -16707,7 +23244,7 @@ index 71697a11ae30..899f5e2de8e9 100644
  
      /* match Length */
      assert(matchLength >= MINMATCH);
-@@ -646,17 +701,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
+@@ -646,17 +696,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
  
  /* ZSTD_updateRep() :
   * updates in-place @rep (array of repeat offsets)
@@ -16730,7 +23267,7 @@ index 71697a11ae30..899f5e2de8e9 100644
          if (repCode > 0) {  /* note : if repCode==0, no change */
              U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
              rep[2] = (repCode >= 2) ? rep[1] : rep[2];
-@@ -673,11 +728,11 @@ typedef struct repcodes_s {
+@@ -673,11 +723,11 @@ typedef struct repcodes_s {
  } repcodes_t;
  
  MEM_STATIC repcodes_t
@@ -16744,7 +23281,7 @@ index 71697a11ae30..899f5e2de8e9 100644
      return newReps;
  }
  
-@@ -685,59 +740,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0
+@@ -685,59 +735,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0
  /*-*************************************
  *  Match length counter
  ***************************************/
@@ -16804,7 +23341,7 @@ index 71697a11ae30..899f5e2de8e9 100644
  MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)
  {
      const BYTE* const pStart = pIn;
-@@ -783,32 +785,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
+@@ -783,32 +780,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
   *  Hashes
   ***************************************/
  static const U32 prime3bytes = 506832829U;
@@ -16860,7 +23397,7 @@ index 71697a11ae30..899f5e2de8e9 100644
      switch(mls)
      {
      default:
-@@ -820,6 +833,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
+@@ -820,6 +828,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
      }
  }
  
@@ -16885,7 +23422,18 @@ index 71697a11ae30..899f5e2de8e9 100644
  /* ZSTD_ipow() :
   * Return base^exponent.
   */
-@@ -1167,10 +1198,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window,
+@@ -1011,7 +1037,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,
+  * The least significant cycleLog bits of the indices must remain the same,
+  * which may be 0. Every index up to maxDist in the past must be valid.
+  */
+-MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
++MEM_STATIC
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
+                                            U32 maxDist, void const* src)
+ {
+     /* preemptive overflow correction:
+@@ -1167,10 +1195,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window,
                      (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
          assert(blockEndIdx >= loadedDictEnd);
  
@@ -16902,7 +23450,18 @@ index 71697a11ae30..899f5e2de8e9 100644
               */
              DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)");
              *loadedDictEndPtr = 0;
-@@ -1302,6 +1338,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
+@@ -1199,7 +1232,9 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) {
+  * forget about the extDict. Handles overlap of the prefix and extDict.
+  * Returns non-zero if the segment is contiguous.
+  */
+-MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
++MEM_STATIC
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++U32 ZSTD_window_update(ZSTD_window_t* window,
+                                   void const* src, size_t srcSize,
+                                   int forceNonContiguous)
+ {
+@@ -1302,6 +1337,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
  
  #endif
  
@@ -16945,7 +23504,20 @@ index 71697a11ae30..899f5e2de8e9 100644
  
  
  /* ===============================================================
-@@ -1396,4 +1468,51 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
+@@ -1381,11 +1452,10 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity);
+  * This cannot be used when long range matching is enabled.
+  * Zstd will use these sequences, and pass the literals to a secondary block
+  * compressor.
+- * @return : An error code on failure.
+  * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory
+  * access and data corruption.
+  */
+-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
+ 
+ /* ZSTD_cycleLog() :
+  *  condition for correct operation : hashLog > 1 */
+@@ -1396,4 +1466,55 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
   */
  void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize);
  
@@ -16976,6 +23548,10 @@ index 71697a11ae30..899f5e2de8e9 100644
 +                                   const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
 +                                   const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
 +
++/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */
++MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) {
++    return params->extSeqProdFunc != NULL;
++}
 +
 +/* ===============================================================
 + * Deprecated definitions that are still used internally to avoid
@@ -17326,7 +23902,7 @@ index 7991364c2f71..7fe6f4ff5cf2 100644
   *
   * This source code is licensed under both the BSD-style license (found in the
 diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c
-index 17d836cc84e8..dbacbaf72733 100644
+index 17d836cc84e8..41f6521b27cd 100644
 --- a/lib/zstd/compress/zstd_compress_superblock.c
 +++ b/lib/zstd/compress/zstd_compress_superblock.c
 @@ -1,5 +1,6 @@
@@ -17375,24 +23951,63 @@ index 17d836cc84e8..dbacbaf72733 100644
 -    {   const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable)
 -                                          : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable);
 +    {   int const flags = bmi2 ? HUF_flags_bmi2 : 0;
-+        const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable, flags)
-+                                          : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable, flags);
++        const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags)
++                                          : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags);
          op += cSize;
          cLitSize += cSize;
          if (cSize == 0 || ERR_isError(cSize)) {
-@@ -126,7 +126,11 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
-     return op-ostart;
+@@ -103,7 +103,7 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+     switch(lhSize)
+     {
+     case 3: /* 2 - 2 - 10 - 10 */
+-        {   U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14);
++        {   U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14);
+             MEM_writeLE24(ostart, lhc);
+             break;
+         }
+@@ -123,26 +123,30 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+     }
+     *entropyWritten = 1;
+     DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart));
+-    return op-ostart;
++    return (size_t)(op-ostart);
  }
  
 -static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) {
+-    const seqDef* const sstart = sequences;
+-    const seqDef* const send = sequences + nbSeq;
+-    const seqDef* sp = sstart;
 +static size_t
 +ZSTD_seqDecompressedSize(seqStore_t const* seqStore,
-+                   const seqDef* sequences, size_t nbSeq,
-+                         size_t litSize, int lastSequence)
++                   const seqDef* sequences, size_t nbSeqs,
++                         size_t litSize, int lastSubBlock)
 +{
-     const seqDef* const sstart = sequences;
-     const seqDef* const send = sequences + nbSeq;
-     const seqDef* sp = sstart;
+     size_t matchLengthSum = 0;
+     size_t litLengthSum = 0;
+-    (void)(litLengthSum); /* suppress unused variable warning on some environments */
+-    while (send-sp > 0) {
+-        ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp);
++    size_t n;
++    for (n=0; n<nbSeqs; n++) {
++        const ZSTD_sequenceLength seqLen = ZSTD_getSequenceLength(seqStore, sequences+n);
+         litLengthSum += seqLen.litLength;
+         matchLengthSum += seqLen.matchLength;
+-        sp++;
+     }
+-    assert(litLengthSum <= litSize);
+-    if (!lastSequence) {
++    DEBUGLOG(5, "ZSTD_seqDecompressedSize: %u sequences from %p: %u literals + %u matchlength",
++                (unsigned)nbSeqs, (const void*)sequences,
++                (unsigned)litLengthSum, (unsigned)matchLengthSum);
++    if (!lastSubBlock)
+         assert(litLengthSum == litSize);
+-    }
++    else
++        assert(litLengthSum <= litSize);
++    (void)litLengthSum;
+     return matchLengthSum + litSize;
+ }
+ 
 @@ -156,13 +160,14 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef*
   *  @return : compressed size of sequences section of a sub-block
   *            Or 0 if it is unable to compress
@@ -17415,7 +24030,408 @@ index 17d836cc84e8..dbacbaf72733 100644
  {
      const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
      BYTE* const ostart = (BYTE*)dst;
-@@ -539,7 +544,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+@@ -176,14 +181,14 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables
+     /* Sequences Header */
+     RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
+                     dstSize_tooSmall, "");
+-    if (nbSeq < 0x7F)
++    if (nbSeq < 128)
+         *op++ = (BYTE)nbSeq;
+     else if (nbSeq < LONGNBSEQ)
+         op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
+     else
+         op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
+     if (nbSeq==0) {
+-        return op - ostart;
++        return (size_t)(op - ostart);
+     }
+ 
+     /* seqHead : flags for FSE encoding type */
+@@ -205,7 +210,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables
+     }
+ 
+     {   size_t const bitstreamSize = ZSTD_encodeSequences(
+-                                        op, oend - op,
++                                        op, (size_t)(oend - op),
+                                         fseTables->matchlengthCTable, mlCode,
+                                         fseTables->offcodeCTable, ofCode,
+                                         fseTables->litlengthCTable, llCode,
+@@ -249,7 +254,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables
+ #endif
+ 
+     *entropyWritten = 1;
+-    return op - ostart;
++    return (size_t)(op - ostart);
+ }
+ 
+ /* ZSTD_compressSubBlock() :
+@@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
+                 litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock);
+     {   size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable,
+                                                         &entropyMetadata->hufMetadata, literals, litSize,
+-                                                        op, oend-op, bmi2, writeLitEntropy, litEntropyWritten);
++                                                        op, (size_t)(oend-op),
++                                                        bmi2, writeLitEntropy, litEntropyWritten);
+         FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed");
+         if (cLitSize == 0) return 0;
+         op += cLitSize;
+@@ -285,18 +291,18 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
+                                                   sequences, nbSeq,
+                                                   llCode, mlCode, ofCode,
+                                                   cctxParams,
+-                                                  op, oend-op,
++                                                  op, (size_t)(oend-op),
+                                                   bmi2, writeSeqEntropy, seqEntropyWritten);
+         FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed");
+         if (cSeqSize == 0) return 0;
+         op += cSeqSize;
+     }
+     /* Write block header */
+-    {   size_t cSize = (op-ostart)-ZSTD_blockHeaderSize;
++    {   size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize;
+         U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+         MEM_writeLE24(ostart, cBlockHeader24);
+     }
+-    return op-ostart;
++    return (size_t)(op-ostart);
+ }
+ 
+ static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize,
+@@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable,
+     return cSeqSizeEstimate + sequencesSectionHeaderSize;
+ }
+ 
+-static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
++typedef struct {
++    size_t estLitSize;
++    size_t estBlockSize;
++} EstimatedBlockSize;
++static EstimatedBlockSize ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
+                                         const BYTE* ofCodeTable,
+                                         const BYTE* llCodeTable,
+                                         const BYTE* mlCodeTable,
+@@ -393,15 +403,17 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
+                                         const ZSTD_entropyCTables_t* entropy,
+                                         const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                                         void* workspace, size_t wkspSize,
+-                                        int writeLitEntropy, int writeSeqEntropy) {
+-    size_t cSizeEstimate = 0;
+-    cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize,
+-                                                         &entropy->huf, &entropyMetadata->hufMetadata,
+-                                                         workspace, wkspSize, writeLitEntropy);
+-    cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
++                                        int writeLitEntropy, int writeSeqEntropy)
++{
++    EstimatedBlockSize ebs;
++    ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize,
++                                                        &entropy->huf, &entropyMetadata->hufMetadata,
++                                                        workspace, wkspSize, writeLitEntropy);
++    ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
+                                                          nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
+                                                          workspace, wkspSize, writeSeqEntropy);
+-    return cSizeEstimate + ZSTD_blockHeaderSize;
++    ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize;
++    return ebs;
+ }
+ 
+ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata)
+@@ -415,13 +427,56 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe
+     return 0;
+ }
+ 
++static size_t countLiterals(seqStore_t const* seqStore, const seqDef* sp, size_t seqCount)
++{
++    size_t n, total = 0;
++    assert(sp != NULL);
++    for (n=0; n<seqCount; n++) {
++        total += ZSTD_getSequenceLength(seqStore, sp+n).litLength;
++    }
++    DEBUGLOG(6, "countLiterals for %zu sequences from %p => %zu bytes", seqCount, (const void*)sp, total);
++    return total;
++}
++
++#define BYTESCALE 256
++
++static size_t sizeBlockSequences(const seqDef* sp, size_t nbSeqs,
++                size_t targetBudget, size_t avgLitCost, size_t avgSeqCost,
++                int firstSubBlock)
++{
++    size_t n, budget = 0, inSize=0;
++    /* entropy headers */
++    size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */
++    assert(firstSubBlock==0 || firstSubBlock==1);
++    budget += headerSize;
++
++    /* first sequence => at least one sequence */
++    budget += sp[0].litLength * avgLitCost + avgSeqCost;
++    if (budget > targetBudget) return 1;
++    inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH);
++
++    /* loop over sequences */
++    for (n=1; n<nbSeqs; n++) {
++        size_t currentCost = sp[n].litLength * avgLitCost + avgSeqCost;
++        budget += currentCost;
++        inSize += sp[n].litLength + (sp[n].mlBase+MINMATCH);
++        /* stop when sub-block budget is reached */
++        if ( (budget > targetBudget)
++            /* though continue to expand until the sub-block is deemed compressible */
++          && (budget < inSize * BYTESCALE) )
++            break;
++    }
++
++    return n;
++}
++
+ /* ZSTD_compressSubBlock_multi() :
+  *  Breaks super-block into multiple sub-blocks and compresses them.
+- *  Entropy will be written to the first block.
+- *  The following blocks will use repeat mode to compress.
+- *  All sub-blocks are compressed blocks (no raw or rle blocks).
+- *  @return : compressed size of the super block (which is multiple ZSTD blocks)
+- *            Or 0 if it failed to compress. */
++ *  Entropy will be written into the first block.
++ *  The following blocks use repeat_mode to compress.
++ *  Sub-blocks are all compressed, except the last one when beneficial.
++ *  @return : compressed size of the super block (which features multiple ZSTD blocks)
++ *            or 0 if it failed to compress. */
+ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+                             const ZSTD_compressedBlockState_t* prevCBlock,
+                             ZSTD_compressedBlockState_t* nextCBlock,
+@@ -434,10 +489,12 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+ {
+     const seqDef* const sstart = seqStorePtr->sequencesStart;
+     const seqDef* const send = seqStorePtr->sequences;
+-    const seqDef* sp = sstart;
++    const seqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */
++    size_t const nbSeqs = (size_t)(send - sstart);
+     const BYTE* const lstart = seqStorePtr->litStart;
+     const BYTE* const lend = seqStorePtr->lit;
+     const BYTE* lp = lstart;
++    size_t const nbLiterals = (size_t)(lend - lstart);
+     BYTE const* ip = (BYTE const*)src;
+     BYTE const* const iend = ip + srcSize;
+     BYTE* const ostart = (BYTE*)dst;
+@@ -446,112 +503,171 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+     const BYTE* llCodePtr = seqStorePtr->llCode;
+     const BYTE* mlCodePtr = seqStorePtr->mlCode;
+     const BYTE* ofCodePtr = seqStorePtr->ofCode;
+-    size_t targetCBlockSize = cctxParams->targetCBlockSize;
+-    size_t litSize, seqCount;
+-    int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed;
++    size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */
++    size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize);
++    int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed);
+     int writeSeqEntropy = 1;
+-    int lastSequence = 0;
+-
+-    DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
+-                (unsigned)(lend-lp), (unsigned)(send-sstart));
+-
+-    litSize = 0;
+-    seqCount = 0;
+-    do {
+-        size_t cBlockSizeEstimate = 0;
+-        if (sstart == send) {
+-            lastSequence = 1;
+-        } else {
+-            const seqDef* const sequence = sp + seqCount;
+-            lastSequence = sequence == send - 1;
+-            litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength;
+-            seqCount++;
+-        }
+-        if (lastSequence) {
+-            assert(lp <= lend);
+-            assert(litSize <= (size_t)(lend - lp));
+-            litSize = (size_t)(lend - lp);
++
++    DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)",
++               (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart));
++
++        /* let's start with a general estimation for the full block */
++    if (nbSeqs > 0) {
++        EstimatedBlockSize const ebs =
++                ZSTD_estimateSubBlockSize(lp, nbLiterals,
++                                        ofCodePtr, llCodePtr, mlCodePtr, nbSeqs,
++                                        &nextCBlock->entropy, entropyMetadata,
++                                        workspace, wkspSize,
++                                        writeLitEntropy, writeSeqEntropy);
++        /* quick estimation */
++        size_t const avgLitCost = nbLiterals ? (ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE;
++        size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs;
++        const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1);
++        size_t n, avgBlockBudget, blockBudgetSupp=0;
++        avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks;
++        DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes",
++                    (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE,
++                    (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE);
++        /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately
++         * this will result in the production of a single uncompressed block covering @srcSize.*/
++        if (ebs.estBlockSize > srcSize) return 0;
++
++        /* compress and write sub-blocks */
++        assert(nbSubBlocks>0);
++        for (n=0; n < nbSubBlocks-1; n++) {
++            /* determine nb of sequences for current sub-block + nbLiterals from next sequence */
++            size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp),
++                                        avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0);
++            /* if reached last sequence : break to last sub-block (simplification) */
++            assert(seqCount <= (size_t)(send-sp));
++            if (sp + seqCount == send) break;
++            assert(seqCount > 0);
++            /* compress sub-block */
++            {   int litEntropyWritten = 0;
++                int seqEntropyWritten = 0;
++                size_t litSize = countLiterals(seqStorePtr, sp, seqCount);
++                const size_t decompressedSize =
++                        ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0);
++                size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
++                                                sp, seqCount,
++                                                lp, litSize,
++                                                llCodePtr, mlCodePtr, ofCodePtr,
++                                                cctxParams,
++                                                op, (size_t)(oend-op),
++                                                bmi2, writeLitEntropy, writeSeqEntropy,
++                                                &litEntropyWritten, &seqEntropyWritten,
++                                                0);
++                FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
++
++                /* check compressibility, update state components */
++                if (cSize > 0 && cSize < decompressedSize) {
++                    DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes",
++                                (unsigned)decompressedSize, (unsigned)cSize);
++                    assert(ip + decompressedSize <= iend);
++                    ip += decompressedSize;
++                    lp += litSize;
++                    op += cSize;
++                    llCodePtr += seqCount;
++                    mlCodePtr += seqCount;
++                    ofCodePtr += seqCount;
++                    /* Entropy only needs to be written once */
++                    if (litEntropyWritten) {
++                        writeLitEntropy = 0;
++                    }
++                    if (seqEntropyWritten) {
++                        writeSeqEntropy = 0;
++                    }
++                    sp += seqCount;
++                    blockBudgetSupp = 0;
++            }   }
++            /* otherwise : do not compress yet, coalesce current sub-block with following one */
+         }
+-        /* I think there is an optimization opportunity here.
+-         * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful
+-         * since it recalculates estimate from scratch.
+-         * For example, it would recount literal distribution and symbol codes every time.
+-         */
+-        cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount,
+-                                                       &nextCBlock->entropy, entropyMetadata,
+-                                                       workspace, wkspSize, writeLitEntropy, writeSeqEntropy);
+-        if (cBlockSizeEstimate > targetCBlockSize || lastSequence) {
+-            int litEntropyWritten = 0;
+-            int seqEntropyWritten = 0;
+-            const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence);
+-            const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
+-                                                       sp, seqCount,
+-                                                       lp, litSize,
+-                                                       llCodePtr, mlCodePtr, ofCodePtr,
+-                                                       cctxParams,
+-                                                       op, oend-op,
+-                                                       bmi2, writeLitEntropy, writeSeqEntropy,
+-                                                       &litEntropyWritten, &seqEntropyWritten,
+-                                                       lastBlock && lastSequence);
+-            FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
+-            if (cSize > 0 && cSize < decompressedSize) {
+-                DEBUGLOG(5, "Committed the sub-block");
+-                assert(ip + decompressedSize <= iend);
+-                ip += decompressedSize;
+-                sp += seqCount;
+-                lp += litSize;
+-                op += cSize;
+-                llCodePtr += seqCount;
+-                mlCodePtr += seqCount;
+-                ofCodePtr += seqCount;
+-                litSize = 0;
+-                seqCount = 0;
+-                /* Entropy only needs to be written once */
+-                if (litEntropyWritten) {
+-                    writeLitEntropy = 0;
+-                }
+-                if (seqEntropyWritten) {
+-                    writeSeqEntropy = 0;
+-                }
++    } /* if (nbSeqs > 0) */
++
++    /* write last block */
++    DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp));
++    {   int litEntropyWritten = 0;
++        int seqEntropyWritten = 0;
++        size_t litSize = (size_t)(lend - lp);
++        size_t seqCount = (size_t)(send - sp);
++        const size_t decompressedSize =
++                ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1);
++        size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
++                                            sp, seqCount,
++                                            lp, litSize,
++                                            llCodePtr, mlCodePtr, ofCodePtr,
++                                            cctxParams,
++                                            op, (size_t)(oend-op),
++                                            bmi2, writeLitEntropy, writeSeqEntropy,
++                                            &litEntropyWritten, &seqEntropyWritten,
++                                            lastBlock);
++        FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
++
++        /* update pointers, the nb of literals borrowed from next sequence must be preserved */
++        if (cSize > 0 && cSize < decompressedSize) {
++            DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes",
++                        (unsigned)decompressedSize, (unsigned)cSize);
++            assert(ip + decompressedSize <= iend);
++            ip += decompressedSize;
++            lp += litSize;
++            op += cSize;
++            llCodePtr += seqCount;
++            mlCodePtr += seqCount;
++            ofCodePtr += seqCount;
++            /* Entropy only needs to be written once */
++            if (litEntropyWritten) {
++                writeLitEntropy = 0;
+             }
++            if (seqEntropyWritten) {
++                writeSeqEntropy = 0;
++            }
++            sp += seqCount;
+         }
+-    } while (!lastSequence);
++    }
++
++
+     if (writeLitEntropy) {
+-        DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten");
++        DEBUGLOG(5, "Literal entropy tables were never written");
+         ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf));
+     }
+     if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) {
+         /* If we haven't written our entropy tables, then we've violated our contract and
+          * must emit an uncompressed block.
+          */
+-        DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten");
++        DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block");
+         return 0;
+     }
++
+     if (ip < iend) {
+-        size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock);
+-        DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip));
++        /* some data left : last part of the block sent uncompressed */
++        size_t const rSize = (size_t)((iend - ip));
++        size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock);
++        DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize));
+         FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
+         assert(cSize != 0);
+         op += cSize;
+         /* We have to regenerate the repcodes because we've skipped some sequences */
+         if (sp < send) {
+-            seqDef const* seq;
++            const seqDef* seq;
              repcodes_t rep;
              ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));
              for (seq = sstart; seq < sp; ++seq) {
@@ -17424,6 +24440,25 @@ index 17d836cc84e8..dbacbaf72733 100644
              }
              ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));
          }
+     }
+-    DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed");
+-    return op-ostart;
++
++    DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u",
++                (unsigned)(op-ostart));
++    return (size_t)(op-ostart);
+ }
+ 
+ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
+                                void* dst, size_t dstCapacity,
+-                               void const* src, size_t srcSize,
+-                               unsigned lastBlock) {
++                               const void* src, size_t srcSize,
++                               unsigned lastBlock)
++{
+     ZSTD_entropyCTablesMetadata_t entropyMetadata;
+ 
+     FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore,
 diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h
 index 224ece79546e..826bbc9e029b 100644
 --- a/lib/zstd/compress/zstd_compress_superblock.h
@@ -17437,7 +24472,7 @@ index 224ece79546e..826bbc9e029b 100644
   *
   * This source code is licensed under both the BSD-style license (found in the
 diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h
-index 349fc923c355..65ea53b62844 100644
+index 349fc923c355..86bc3c2c23c7 100644
 --- a/lib/zstd/compress/zstd_cwksp.h
 +++ b/lib/zstd/compress/zstd_cwksp.h
 @@ -1,5 +1,6 @@
@@ -17666,7 +24701,13 @@ index 349fc923c355..65ea53b62844 100644
  /*
   * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
   */
-@@ -361,13 +401,17 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes)
+@@ -356,18 +396,22 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes)
+ 
+ /*
+  * Aligned on 64 bytes. These buffers have the special property that
+- * their values remain constrained, allowing us to re-use them without
++ * their values remain constrained, allowing us to reuse them without
+  * memset()-ing them.
   */
  MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes)
  {
@@ -17696,7 +24737,7 @@ index 349fc923c355..65ea53b62844 100644
      }
      ZSTD_cwksp_mark_tables_clean(ws);
  }
-@@ -478,10 +522,10 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {
+@@ -478,14 +522,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {
  
  
      ws->tableEnd = ws->objectEnd;
@@ -17710,7 +24751,20 @@ index 349fc923c355..65ea53b62844 100644
      }
      ZSTD_cwksp_assert_internal_consistency(ws);
  }
-@@ -498,6 +542,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c
+ 
++MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) {
++    return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace);
++}
++
++MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) {
++    return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace)
++         + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart);
++}
++
+ /*
+  * The provided workspace takes ownership of the buffer [start, start+size).
+  * Any existing values in the workspace are ignored (the previously managed
+@@ -498,6 +551,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c
      ws->workspaceEnd = (BYTE*)start + size;
      ws->objectEnd = ws->workspace;
      ws->tableValidEnd = ws->objectEnd;
@@ -17718,6 +24772,22 @@ index 349fc923c355..65ea53b62844 100644
      ws->phase = ZSTD_cwksp_alloc_objects;
      ws->isStatic = isStatic;
      ZSTD_cwksp_clear(ws);
+@@ -529,15 +583,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) {
+     ZSTD_memset(src, 0, sizeof(ZSTD_cwksp));
+ }
+ 
+-MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) {
+-    return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace);
+-}
+-
+-MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) {
+-    return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace)
+-         + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart);
+-}
+-
+ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
+     return ws->allocFailed;
+ }
 @@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
   * Returns if the estimated space needed for a wksp is within an acceptable limit of the
   * actual amount of space used.
@@ -17742,7 +24812,7 @@ index 349fc923c355..65ea53b62844 100644
  
  
 diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c
-index 76933dea2624..ab9440a99603 100644
+index 76933dea2624..5ff54f17d92f 100644
 --- a/lib/zstd/compress/zstd_double_fast.c
 +++ b/lib/zstd/compress/zstd_double_fast.c
 @@ -1,5 +1,6 @@
@@ -17753,11 +24823,16 @@ index 76933dea2624..ab9440a99603 100644
   * All rights reserved.
   *
   * This source code is licensed under both the BSD-style license (found in the
-@@ -11,8 +12,43 @@
+@@ -11,8 +12,49 @@
  #include "zstd_compress_internal.h"
  #include "zstd_double_fast.h"
  
-+static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,
++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
+ 
+-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,
 +                              void const* end, ZSTD_dictTableLoadMethod_e dtlm)
 +{
 +    const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -17770,8 +24845,7 @@ index 76933dea2624..ab9440a99603 100644
 +    const BYTE* ip = base + ms->nextToUpdate;
 +    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
 +    const U32 fastHashFillStep = 3;
- 
--void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
++
 +    /* Always insert every fastHashFillStep position into the hash tables.
 +     * Insert the other positions into the large hash table if their entry
 +     * is empty.
@@ -17794,11 +24868,13 @@ index 76933dea2624..ab9440a99603 100644
 +    }   }
 +}
 +
-+static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms,
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms,
                                void const* end, ZSTD_dictTableLoadMethod_e dtlm)
  {
      const ZSTD_compressionParameters* const cParams = &ms->cParams;
-@@ -43,7 +79,19 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+@@ -43,11 +85,24 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
              /* Only load extra positions for ZSTD_dtlm_full */
              if (dtlm == ZSTD_dtlm_fast)
                  break;
@@ -17819,7 +24895,12 @@ index 76933dea2624..ab9440a99603 100644
  }
  
  
-@@ -67,7 +115,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+ FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize, U32 const mls /* template */)
+@@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
      const BYTE* const iend = istart + srcSize;
      const BYTE* const ilimit = iend - HASH_READ_SIZE;
      U32 offset_1=rep[0], offset_2=rep[1];
@@ -17828,7 +24909,7 @@ index 76933dea2624..ab9440a99603 100644
  
      size_t mLength;
      U32 offset;
-@@ -100,8 +148,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+@@ -100,8 +155,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
          U32 const current = (U32)(ip - base);
          U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
          U32 const maxRep = current - windowLow;
@@ -17839,7 +24920,7 @@ index 76933dea2624..ab9440a99603 100644
      }
  
      /* Outer Loop: one iteration per match found and stored */
-@@ -131,7 +179,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+@@ -131,7 +186,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
              if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
                  mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
                  ip++;
@@ -17848,7 +24929,7 @@ index 76933dea2624..ab9440a99603 100644
                  goto _match_stored;
              }
  
-@@ -175,9 +223,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+@@ -175,9 +230,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
          } while (ip1 <= ilimit);
  
  _cleanup:
@@ -17864,7 +24945,7 @@ index 76933dea2624..ab9440a99603 100644
  
          /* Return the last literals size */
          return (size_t)(iend - anchor);
-@@ -217,7 +269,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+@@ -217,7 +276,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
              hashLong[hl1] = (U32)(ip1 - base);
          }
  
@@ -17873,7 +24954,7 @@ index 76933dea2624..ab9440a99603 100644
  
  _match_stored:
          /* match found */
-@@ -243,7 +295,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+@@ -243,7 +302,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
                  U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;  /* swap offset_2 <=> offset_1 */
                  hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
                  hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
@@ -17882,7 +24963,15 @@ index 76933dea2624..ab9440a99603 100644
                  ip += rLength;
                  anchor = ip;
                  continue;   /* faster when present ... (?) */
-@@ -275,7 +327,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+@@ -254,6 +313,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
+ 
+ 
+ FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize,
+@@ -275,7 +335,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
      const BYTE* const iend = istart + srcSize;
      const BYTE* const ilimit = iend - HASH_READ_SIZE;
      U32 offset_1=rep[0], offset_2=rep[1];
@@ -17890,7 +24979,7 @@ index 76933dea2624..ab9440a99603 100644
  
      const ZSTD_matchState_t* const dms = ms->dictMatchState;
      const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
-@@ -286,8 +337,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+@@ -286,8 +345,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
      const BYTE* const dictStart    = dictBase + dictStartIndex;
      const BYTE* const dictEnd      = dms->window.nextSrc;
      const U32 dictIndexDelta       = prefixLowestIndex - (U32)(dictEnd - dictBase);
@@ -17901,21 +24990,21 @@ index 76933dea2624..ab9440a99603 100644
      const U32 dictAndPrefixLength  = (U32)((ip - prefixLowest) + (dictEnd - dictStart));
  
      DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic");
-@@ -295,6 +346,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+@@ -295,6 +354,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
      /* if a dictionary is attached, it must be within window range */
      assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);
  
 +    if (ms->prefetchCDictTables) {
 +        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
 +        size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32);
-+        PREFETCH_AREA(dictHashLong, hashTableBytes)
-+        PREFETCH_AREA(dictHashSmall, chainTableBytes)
++        PREFETCH_AREA(dictHashLong, hashTableBytes);
++        PREFETCH_AREA(dictHashSmall, chainTableBytes);
 +    }
 +
      /* init */
      ip += (dictAndPrefixLength == 0);
  
-@@ -309,8 +367,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+@@ -309,8 +375,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
          U32 offset;
          size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
          size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
@@ -17930,7 +25019,7 @@ index 76933dea2624..ab9440a99603 100644
          U32 const curr = (U32)(ip-base);
          U32 const matchIndexL = hashLong[h2];
          U32 matchIndexS = hashSmall[h];
-@@ -328,7 +390,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+@@ -328,7 +398,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
              const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
              mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
              ip++;
@@ -17939,7 +25028,7 @@ index 76933dea2624..ab9440a99603 100644
              goto _match_stored;
          }
  
-@@ -340,9 +402,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+@@ -340,9 +410,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                  while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
                  goto _match_found;
              }
@@ -17951,7 +25040,7 @@ index 76933dea2624..ab9440a99603 100644
              const BYTE* dictMatchL = dictBase + dictMatchIndexL;
              assert(dictMatchL < dictEnd);
  
-@@ -358,9 +420,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+@@ -358,9 +428,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
              if (MEM_read32(match) == MEM_read32(ip)) {
                  goto _search_next_long;
              }
@@ -17963,7 +25052,7 @@ index 76933dea2624..ab9440a99603 100644
              match = dictBase + dictMatchIndexS;
              matchIndexS = dictMatchIndexS + dictIndexDelta;
  
-@@ -375,10 +437,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+@@ -375,10 +445,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
          continue;
  
  _search_next_long:
@@ -17977,7 +25066,7 @@ index 76933dea2624..ab9440a99603 100644
              const BYTE* matchL3 = base + matchIndexL3;
              hashLong[hl3] = curr + 1;
  
-@@ -391,9 +454,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+@@ -391,9 +462,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                      while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
                      goto _match_found;
                  }
@@ -17989,7 +25078,7 @@ index 76933dea2624..ab9440a99603 100644
                  const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
                  assert(dictMatchL3 < dictEnd);
                  if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
-@@ -419,7 +482,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+@@ -419,7 +490,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
          offset_2 = offset_1;
          offset_1 = offset;
  
@@ -17998,7 +25087,7 @@ index 76933dea2624..ab9440a99603 100644
  
  _match_stored:
          /* match found */
-@@ -448,7 +511,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+@@ -448,7 +519,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                      const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
                      size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
                      U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
@@ -18007,7 +25096,7 @@ index 76933dea2624..ab9440a99603 100644
                      hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
                      hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
                      ip += repLength2;
-@@ -461,8 +524,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
+@@ -461,8 +532,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
      }   /* while (ip < ilimit) */
  
      /* save reps for next block */
@@ -18018,7 +25107,18 @@ index 76933dea2624..ab9440a99603 100644
  
      /* Return the last literals size */
      return (size_t)(iend - anchor);
-@@ -585,7 +648,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+@@ -527,7 +598,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState(
+ }
+ 
+ 
+-static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize,
+         U32 const mls /* template */)
+@@ -585,7 +658,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
              const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
              mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
              ip++;
@@ -18027,7 +25127,7 @@ index 76933dea2624..ab9440a99603 100644
          } else {
              if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
                  const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
-@@ -596,7 +659,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+@@ -596,7 +669,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                  while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; }   /* catch up */
                  offset_2 = offset_1;
                  offset_1 = offset;
@@ -18036,7 +25136,7 @@ index 76933dea2624..ab9440a99603 100644
  
              } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {
                  size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
-@@ -621,7 +684,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+@@ -621,7 +694,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                  }
                  offset_2 = offset_1;
                  offset_1 = offset;
@@ -18045,7 +25145,7 @@ index 76933dea2624..ab9440a99603 100644
  
              } else {
                  ip += ((ip-anchor) >> kSearchStrength) + 1;
-@@ -653,7 +716,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+@@ -653,7 +726,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                      const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                      size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                      U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
@@ -18054,8 +25154,14 @@ index 76933dea2624..ab9440a99603 100644
                      hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
                      hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
                      ip += repLength2;
+@@ -694,3 +767,5 @@ size_t ZSTD_compressBlock_doubleFast_extDict(
+         return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize);
+     }
+ }
++
++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */
 diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h
-index 6822bde65a1d..0204f12e4cf7 100644
+index 6822bde65a1d..b7ddc714f13e 100644
 --- a/lib/zstd/compress/zstd_double_fast.h
 +++ b/lib/zstd/compress/zstd_double_fast.h
 @@ -1,5 +1,6 @@
@@ -18066,18 +25172,37 @@ index 6822bde65a1d..0204f12e4cf7 100644
   * All rights reserved.
   *
   * This source code is licensed under both the BSD-style license (found in the
-@@ -16,7 +17,8 @@
+@@ -15,8 +16,12 @@
+ #include "../common/mem.h"      /* U32 */
  #include "zstd_compress_internal.h"     /* ZSTD_CCtx, size_t */
  
++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
++
  void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
 -                              void const* end, ZSTD_dictTableLoadMethod_e dtlm);
 +                              void const* end, ZSTD_dictTableLoadMethod_e dtlm,
 +                              ZSTD_tableFillPurpose_e tfp);
++
  size_t ZSTD_compressBlock_doubleFast(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize);
+@@ -27,6 +32,14 @@ size_t ZSTD_compressBlock_doubleFast_extDict(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+ 
++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast
++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState
++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict
++#else
++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL
++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL
++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL
++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */
+ 
+ 
+ #endif /* ZSTD_DOUBLE_FAST_H */
 diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c
-index a752e6beab52..3399b39c5dbc 100644
+index a752e6beab52..b7a63ba4ce56 100644
 --- a/lib/zstd/compress/zstd_fast.c
 +++ b/lib/zstd/compress/zstd_fast.c
 @@ -1,5 +1,6 @@
@@ -18088,11 +25213,13 @@ index a752e6beab52..3399b39c5dbc 100644
   * All rights reserved.
   *
   * This source code is licensed under both the BSD-style license (found in the
-@@ -11,8 +12,42 @@
+@@ -11,8 +12,46 @@
  #include "zstd_compress_internal.h"  /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
  #include "zstd_fast.h"
  
-+static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
 +                        const void* const end,
 +                        ZSTD_dictTableLoadMethod_e dtlm)
 +{
@@ -18128,11 +25255,13 @@ index a752e6beab52..3399b39c5dbc 100644
 +                }   }   }   }
 +}
 +
-+static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
                          const void* const end,
                          ZSTD_dictTableLoadMethod_e dtlm)
  {
-@@ -25,6 +60,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+@@ -25,6 +64,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
      const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
      const U32 fastHashFillStep = 3;
  
@@ -18143,7 +25272,7 @@ index a752e6beab52..3399b39c5dbc 100644
      /* Always insert every fastHashFillStep position into the hash table.
       * Insert the other positions if their hash entry is empty.
       */
-@@ -42,6 +81,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+@@ -42,6 +85,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
      }   }   }   }
  }
  
@@ -18162,7 +25291,19 @@ index a752e6beab52..3399b39c5dbc 100644
  
  /*
   * If you squint hard enough (and ignore repcodes), the search operation at any
-@@ -117,7 +168,7 @@ ZSTD_compressBlock_fast_noDict_generic(
+@@ -89,8 +144,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+  *
+  * This is also the work we do at the beginning to enter the loop initially.
+  */
+-FORCE_INLINE_TEMPLATE size_t
+-ZSTD_compressBlock_fast_noDict_generic(
++FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++size_t ZSTD_compressBlock_fast_noDict_generic(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize,
+         U32 const mls, U32 const hasStep)
+@@ -117,7 +173,7 @@ ZSTD_compressBlock_fast_noDict_generic(
  
      U32 rep_offset1 = rep[0];
      U32 rep_offset2 = rep[1];
@@ -18171,7 +25312,7 @@ index a752e6beab52..3399b39c5dbc 100644
  
      size_t hash0; /* hash for ip0 */
      size_t hash1; /* hash for ip1 */
-@@ -141,8 +192,8 @@ ZSTD_compressBlock_fast_noDict_generic(
+@@ -141,8 +197,8 @@ ZSTD_compressBlock_fast_noDict_generic(
      {   U32 const curr = (U32)(ip0 - base);
          U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
          U32 const maxRep = curr - windowLow;
@@ -18182,7 +25323,7 @@ index a752e6beab52..3399b39c5dbc 100644
      }
  
      /* start each op */
-@@ -180,8 +231,14 @@ ZSTD_compressBlock_fast_noDict_generic(
+@@ -180,8 +236,14 @@ ZSTD_compressBlock_fast_noDict_generic(
              mLength = ip0[-1] == match0[-1];
              ip0 -= mLength;
              match0 -= mLength;
@@ -18198,7 +25339,7 @@ index a752e6beab52..3399b39c5dbc 100644
              goto _match;
          }
  
-@@ -195,6 +252,12 @@ ZSTD_compressBlock_fast_noDict_generic(
+@@ -195,6 +257,12 @@ ZSTD_compressBlock_fast_noDict_generic(
          /* check match at ip[0] */
          if (MEM_read32(ip0) == mval) {
              /* found a match! */
@@ -18211,7 +25352,7 @@ index a752e6beab52..3399b39c5dbc 100644
              goto _offset;
          }
  
-@@ -224,6 +287,21 @@ ZSTD_compressBlock_fast_noDict_generic(
+@@ -224,6 +292,21 @@ ZSTD_compressBlock_fast_noDict_generic(
          /* check match at ip[0] */
          if (MEM_read32(ip0) == mval) {
              /* found a match! */
@@ -18233,7 +25374,7 @@ index a752e6beab52..3399b39c5dbc 100644
              goto _offset;
          }
  
-@@ -254,9 +332,24 @@ ZSTD_compressBlock_fast_noDict_generic(
+@@ -254,9 +337,24 @@ ZSTD_compressBlock_fast_noDict_generic(
       * However, it seems to be a meaningful performance hit to try to search
       * them. So let's not. */
  
@@ -18260,7 +25401,7 @@ index a752e6beab52..3399b39c5dbc 100644
  
      /* Return the last literals size */
      return (size_t)(iend - anchor);
-@@ -267,7 +360,7 @@ ZSTD_compressBlock_fast_noDict_generic(
+@@ -267,7 +365,7 @@ ZSTD_compressBlock_fast_noDict_generic(
      match0 = base + idx;
      rep_offset2 = rep_offset1;
      rep_offset1 = (U32)(ip0-match0);
@@ -18269,7 +25410,7 @@ index a752e6beab52..3399b39c5dbc 100644
      mLength = 4;
  
      /* Count the backwards match length. */
-@@ -287,11 +380,6 @@ ZSTD_compressBlock_fast_noDict_generic(
+@@ -287,11 +385,6 @@ ZSTD_compressBlock_fast_noDict_generic(
      ip0 += mLength;
      anchor = ip0;
  
@@ -18281,7 +25422,7 @@ index a752e6beab52..3399b39c5dbc 100644
      /* Fill table and check for immediate repcode. */
      if (ip0 <= ilimit) {
          /* Fill Table */
-@@ -306,7 +394,7 @@ ZSTD_compressBlock_fast_noDict_generic(
+@@ -306,7 +399,7 @@ ZSTD_compressBlock_fast_noDict_generic(
                  { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
                  hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
                  ip0 += rLength;
@@ -18290,7 +25431,15 @@ index a752e6beab52..3399b39c5dbc 100644
                  anchor = ip0;
                  continue;   /* faster when present (confirmed on gcc-8) ... (?) */
      }   }   }
-@@ -380,14 +468,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
+@@ -369,6 +462,7 @@ size_t ZSTD_compressBlock_fast(
+ }
+ 
+ FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
+@@ -380,14 +474,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
      U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
      const BYTE* const base = ms->window.base;
      const BYTE* const istart = (const BYTE*)src;
@@ -18307,7 +25456,7 @@ index a752e6beab52..3399b39c5dbc 100644
  
      const ZSTD_matchState_t* const dms = ms->dictMatchState;
      const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
-@@ -397,13 +485,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
+@@ -397,13 +491,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
      const BYTE* const dictStart    = dictBase + dictStartIndex;
      const BYTE* const dictEnd      = dms->window.nextSrc;
      const U32 dictIndexDelta       = prefixStartIndex - (U32)(dictEnd - dictBase);
@@ -18324,13 +25473,13 @@ index a752e6beab52..3399b39c5dbc 100644
      assert(endIndex - prefixStartIndex <= maxDistance);
      (void)maxDistance; (void)endIndex;   /* these variables are not used when assert() is disabled */
  
-@@ -413,106 +501,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
+@@ -413,106 +507,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
       * when translating a dict index into a local index */
      assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
  
 +    if (ms->prefetchCDictTables) {
 +        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
-+        PREFETCH_AREA(dictHashTable, hashTableBytes)
++        PREFETCH_AREA(dictHashTable, hashTableBytes);
 +    }
 +
      /* init */
@@ -18545,7 +25694,18 @@ index a752e6beab52..3399b39c5dbc 100644
  
      /* Return the last literals size */
      return (size_t)(iend - anchor);
-@@ -553,11 +690,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
+@@ -545,7 +688,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState(
+ }
+ 
+ 
+-static size_t ZSTD_compressBlock_fast_extDict_generic(
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++size_t ZSTD_compressBlock_fast_extDict_generic(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
+ {
+@@ -553,11 +698,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
      U32* const hashTable = ms->hashTable;
      U32 const hlog = cParams->hashLog;
      /* support stepSize of 0 */
@@ -18558,7 +25718,7 @@ index a752e6beab52..3399b39c5dbc 100644
      const BYTE* anchor = istart;
      const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
      const U32   lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
-@@ -570,6 +706,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
+@@ -570,6 +714,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
      const BYTE* const iend = istart + srcSize;
      const BYTE* const ilimit = iend - 8;
      U32 offset_1=rep[0], offset_2=rep[1];
@@ -18587,7 +25747,7 @@ index a752e6beab52..3399b39c5dbc 100644
  
      (void)hasStep; /* not currently specialized on whether it's accelerated */
  
-@@ -579,75 +737,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
+@@ -579,75 +745,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
      if (prefixStartIndex == dictStartIndex)
          return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
  
@@ -18851,7 +26011,7 @@ index a752e6beab52..3399b39c5dbc 100644
  }
  
  ZSTD_GEN_FAST_FN(extDict, 4, 0)
-@@ -660,6 +945,7 @@ size_t ZSTD_compressBlock_fast_extDict(
+@@ -660,6 +953,7 @@ size_t ZSTD_compressBlock_fast_extDict(
          void const* src, size_t srcSize)
  {
      U32 const mls = ms->cParams.minMatch;
@@ -18882,7 +26042,7 @@ index fddc2f532d21..e64d9e1b2d39 100644
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize);
 diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c
-index 0298a01a7504..f6b4978ceba7 100644
+index 0298a01a7504..3e88d8a1a136 100644
 --- a/lib/zstd/compress/zstd_lazy.c
 +++ b/lib/zstd/compress/zstd_lazy.c
 @@ -1,5 +1,6 @@
@@ -18893,17 +26053,57 @@ index 0298a01a7504..f6b4978ceba7 100644
   * All rights reserved.
   *
   * This source code is licensed under both the BSD-style license (found in the
-@@ -10,6 +11,9 @@
+@@ -10,14 +11,23 @@
  
  #include "zstd_compress_internal.h"
  #include "zstd_lazy.h"
 +#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
 +
++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
++
 +#define kLazySkippingStep 8
  
  
  /*-*************************************
-@@ -197,8 +201,8 @@ ZSTD_DUBT_findBetterDictMatch (
+ *  Binary Tree search
+ ***************************************/
+ 
+-static void
+-ZSTD_updateDUBT(ZSTD_matchState_t* ms,
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++void ZSTD_updateDUBT(ZSTD_matchState_t* ms,
+                 const BYTE* ip, const BYTE* iend,
+                 U32 mls)
+ {
+@@ -60,8 +70,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
+  *  sort one already inserted but unsorted position
+  *  assumption : curr >= btlow == (curr - btmask)
+  *  doesn't fail */
+-static void
+-ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
+                  U32 curr, const BYTE* inputEnd,
+                  U32 nbCompares, U32 btLow,
+                  const ZSTD_dictMode_e dictMode)
+@@ -149,8 +160,9 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
+ }
+ 
+ 
+-static size_t
+-ZSTD_DUBT_findBetterDictMatch (
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++size_t ZSTD_DUBT_findBetterDictMatch (
+         const ZSTD_matchState_t* ms,
+         const BYTE* const ip, const BYTE* const iend,
+         size_t* offsetPtr,
+@@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch (
              U32 matchIndex = dictMatchIndex + dictIndexDelta;
              if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
                  DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
@@ -18914,7 +26114,7 @@ index 0298a01a7504..f6b4978ceba7 100644
              }
              if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
                  break;   /* drop, to guarantee consistency (miss a little bit of compression) */
-@@ -218,7 +222,7 @@ ZSTD_DUBT_findBetterDictMatch (
+@@ -218,7 +230,7 @@ ZSTD_DUBT_findBetterDictMatch (
      }
  
      if (bestLength >= MINMATCH) {
@@ -18923,16 +26123,22 @@ index 0298a01a7504..f6b4978ceba7 100644
          DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                      curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
      }
-@@ -230,7 +234,7 @@ ZSTD_DUBT_findBetterDictMatch (
- static size_t
- ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+@@ -227,10 +239,11 @@ ZSTD_DUBT_findBetterDictMatch (
+ }
+ 
+ 
+-static size_t
+-ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
                          const BYTE* const ip, const BYTE* const iend,
 -                        size_t* offsetPtr,
 +                        size_t* offBasePtr,
                          U32 const mls,
                          const ZSTD_dictMode_e dictMode)
  {
-@@ -327,8 +331,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+@@ -327,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
              if (matchLength > bestLength) {
                  if (matchLength > matchEndIdx - matchIndex)
                      matchEndIdx = matchIndex + (U32)matchLength;
@@ -18943,7 +26149,7 @@ index 0298a01a7504..f6b4978ceba7 100644
                  if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
                      if (dictMode == ZSTD_dictMatchState) {
                          nbCompares = 0; /* in addition to avoiding checking any
-@@ -361,16 +365,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+@@ -361,16 +374,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
          if (dictMode == ZSTD_dictMatchState && nbCompares) {
              bestLength = ZSTD_DUBT_findBetterDictMatch(
                      ms, ip, iend,
@@ -18963,9 +26169,15 @@ index 0298a01a7504..f6b4978ceba7 100644
          }
          return bestLength;
      }
-@@ -381,14 +385,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
- FORCE_INLINE_TEMPLATE size_t
- ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
+@@ -378,17 +391,18 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+ 
+ 
+ /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */
+-FORCE_INLINE_TEMPLATE size_t
+-ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
++FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
                  const BYTE* const ip, const BYTE* const iLimit,
 -                      size_t* offsetPtr,
 +                      size_t* offBasePtr,
@@ -18980,7 +26192,7 @@ index 0298a01a7504..f6b4978ceba7 100644
  }
  
  /* *********************************
-@@ -561,7 +565,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
+@@ -561,7 +575,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
          /* save best solution */
          if (currentMl > ml) {
              ml = currentMl;
@@ -18989,7 +26201,7 @@ index 0298a01a7504..f6b4978ceba7 100644
              if (ip+currentMl == iLimit) {
                  /* best possible, avoids read overflow on next attempt */
                  return ml;
-@@ -598,7 +602,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
+@@ -598,7 +612,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
              /* save best solution */
              if (currentMl > ml) {
                  ml = currentMl;
@@ -18998,8 +26210,14 @@ index 0298a01a7504..f6b4978ceba7 100644
                  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
              }
          }
-@@ -617,7 +621,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
- FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
+@@ -614,10 +628,12 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
+ 
+ /* Update chains up to ip (excluded)
+    Assumption : always within prefix (i.e. not within extDict) */
+-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
++FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++U32 ZSTD_insertAndFindFirstIndex_internal(
                          ZSTD_matchState_t* ms,
                          const ZSTD_compressionParameters* const cParams,
 -                        const BYTE* ip, U32 const mls)
@@ -19007,7 +26225,7 @@ index 0298a01a7504..f6b4978ceba7 100644
  {
      U32* const hashTable  = ms->hashTable;
      const U32 hashLog = cParams->hashLog;
-@@ -632,6 +636,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
+@@ -632,6 +648,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
          NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
          hashTable[h] = idx;
          idx++;
@@ -19017,7 +26235,7 @@ index 0298a01a7504..f6b4978ceba7 100644
      }
  
      ms->nextToUpdate = target;
-@@ -640,7 +647,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
+@@ -640,11 +659,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
  
  U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
      const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -19026,7 +26244,12 @@ index 0298a01a7504..f6b4978ceba7 100644
  }
  
  /* inlining is important to hardwire a hot branch (template emulation) */
-@@ -684,14 +691,15 @@ size_t ZSTD_HcFindBestMatch(
+ FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_HcFindBestMatch(
+                         ZSTD_matchState_t* ms,
+                         const BYTE* const ip, const BYTE* const iLimit,
+@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch(
      }
  
      /* HC4 match finder */
@@ -19044,7 +26267,7 @@ index 0298a01a7504..f6b4978ceba7 100644
                  currentMl = ZSTD_count(ip, match, iLimit);
          } else {
              const BYTE* const match = dictBase + matchIndex;
-@@ -703,7 +711,7 @@ size_t ZSTD_HcFindBestMatch(
+@@ -703,7 +724,7 @@ size_t ZSTD_HcFindBestMatch(
          /* save best solution */
          if (currentMl > ml) {
              ml = currentMl;
@@ -19053,7 +26276,7 @@ index 0298a01a7504..f6b4978ceba7 100644
              if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
          }
  
-@@ -739,7 +747,7 @@ size_t ZSTD_HcFindBestMatch(
+@@ -739,7 +760,7 @@ size_t ZSTD_HcFindBestMatch(
              if (currentMl > ml) {
                  ml = currentMl;
                  assert(curr > matchIndex + dmsIndexDelta);
@@ -19062,7 +26285,7 @@ index 0298a01a7504..f6b4978ceba7 100644
                  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
              }
  
-@@ -756,8 +764,6 @@ size_t ZSTD_HcFindBestMatch(
+@@ -756,8 +777,6 @@ size_t ZSTD_HcFindBestMatch(
  * (SIMD) Row-based matchfinder
  ***********************************/
  /* Constants for row-based hash */
@@ -19071,7 +26294,7 @@ index 0298a01a7504..f6b4978ceba7 100644
  #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
  #define ZSTD_ROW_HASH_MAX_ENTRIES 64    /* absolute maximum number of entries per row, for all configurations */
  
-@@ -769,64 +775,19 @@ typedef U64 ZSTD_VecMask;   /* Clarifies when we are interacting with a U64 repr
+@@ -769,64 +788,19 @@ typedef U64 ZSTD_VecMask;   /* Clarifies when we are interacting with a U64 repr
   * Starting from the LSB, returns the idx of the next non-zero bit.
   * Basically counting the nb of trailing zeroes.
   */
@@ -19143,7 +26366,7 @@ index 0298a01a7504..f6b4978ceba7 100644
  }
  
  /* ZSTD_isAligned():
-@@ -840,7 +801,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
+@@ -840,7 +814,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
  /* ZSTD_row_prefetch():
   * Performs prefetching for the hashTable and tagTable at a given row.
   */
@@ -19152,7 +26375,15 @@ index 0298a01a7504..f6b4978ceba7 100644
      PREFETCH_L1(hashTable + relRow);
      if (rowLog >= 5) {
          PREFETCH_L1(hashTable + relRow + 16);
-@@ -864,13 +825,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
+@@ -859,18 +833,20 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* ta
+  * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
+  * but not beyond iLimit.
+  */
+-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
++FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
+                                    U32 const rowLog, U32 const mls,
                                     U32 idx, const BYTE* const iLimit)
  {
      U32 const* const hashTable = ms->hashTable;
@@ -19168,11 +26399,15 @@ index 0298a01a7504..f6b4978ceba7 100644
          U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
          ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
          ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
-@@ -886,11 +847,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
+@@ -885,12 +861,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
+  * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
   * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
   */
- FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
+-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
 -                                                  U16 const* tagTable, BYTE const* base,
++FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
 +                                                  BYTE const* tagTable, BYTE const* base,
                                                    U32 idx, U32 const hashLog,
 -                                                  U32 const rowLog, U32 const mls)
@@ -19184,8 +26419,20 @@ index 0298a01a7504..f6b4978ceba7 100644
      U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
      ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
      {   U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
-@@ -908,22 +870,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
-                                                         U32 const rowMask, U32 const useCache)
+@@ -902,28 +881,29 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab
+ /* ZSTD_row_update_internalImpl():
+  * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
+  */
+-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
+-                                                        U32 updateStartIdx, U32 const updateEndIdx,
+-                                                        U32 const mls, U32 const rowLog,
+-                                                        U32 const rowMask, U32 const useCache)
++FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
++                                  U32 updateStartIdx, U32 const updateEndIdx,
++                                  U32 const mls, U32 const rowLog,
++                                  U32 const rowMask, U32 const useCache)
  {
      U32* const hashTable = ms->hashTable;
 -    U16* const tagTable = ms->tagTable;
@@ -19213,7 +26460,22 @@ index 0298a01a7504..f6b4978ceba7 100644
          row[pos] = updateStartIdx;
      }
  }
-@@ -971,7 +932,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
+@@ -932,9 +912,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
+  * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
+  * Skips sections of long matches as is necessary.
+  */
+-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
+-                                                    U32 const mls, U32 const rowLog,
+-                                                    U32 const rowMask, U32 const useCache)
++FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
++                              U32 const mls, U32 const rowLog,
++                              U32 const rowMask, U32 const useCache)
+ {
+     U32 idx = ms->nextToUpdate;
+     const BYTE* const base = ms->window.base;
+@@ -971,7 +953,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
      const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
  
      DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
@@ -19250,7 +26512,7 @@ index 0298a01a7504..f6b4978ceba7 100644
  }
  
  #if defined(ZSTD_ARCH_X86_SSE2)
-@@ -994,71 +983,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
+@@ -994,71 +1004,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
  }
  #endif
  
@@ -19382,7 +26644,7 @@ index 0298a01a7504..f6b4978ceba7 100644
          const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
          const size_t xFF = ~((size_t)0);
          const size_t x01 = xFF / 0xFF;
-@@ -1091,11 +1091,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
+@@ -1091,11 +1112,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
          }
          matches = ~matches;
          if (rowEntries == 16) {
@@ -19397,7 +26659,38 @@ index 0298a01a7504..f6b4978ceba7 100644
          }
      }
  #endif
-@@ -1125,7 +1125,7 @@ size_t ZSTD_RowFindBestMatch(
+@@ -1103,20 +1124,21 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
+ 
+ /* The high-level approach of the SIMD row based match finder is as follows:
+  * - Figure out where to insert the new entry:
+- *      - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
+- *      - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
++ *      - Generate a hash for current input position and split it into one byte of tag and `rowHashLog` bits of index.
++ *           - The hash is salted by a value that changes on every context reset, so when the same table is used
++ *             we will avoid collisions that would otherwise slow us down by introducing phantom matches.
++ *      - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines
+  *        which row to insert into.
+- *      - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
+- *        be considered as a circular buffer with a "head" index that resides in the tagTable.
+- *      - Also insert the "tag" into the equivalent row and position in the tagTable.
+- *          - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
+- *                  The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
+- *                  for alignment/performance reasons, leaving some bytes unused.
+- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
++ *      - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can
++ *        be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes
++ *        per row).
++ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and
+  *   generate a bitfield that we can cycle through to check the collisions in the hash table.
+  * - Pick the longest match.
++ * - Insert the tag into the equivalent row and position in the tagTable.
+  */
+ FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_RowFindBestMatch(
+                         ZSTD_matchState_t* ms,
+                         const BYTE* const ip, const BYTE* const iLimit,
+@@ -1125,7 +1147,7 @@ size_t ZSTD_RowFindBestMatch(
                          const U32 rowLog)
  {
      U32* const hashTable = ms->hashTable;
@@ -19406,7 +26699,7 @@ index 0298a01a7504..f6b4978ceba7 100644
      U32* const hashCache = ms->hashCache;
      const U32 hashLog = ms->rowHashLog;
      const ZSTD_compressionParameters* const cParams = &ms->cParams;
-@@ -1143,8 +1143,11 @@ size_t ZSTD_RowFindBestMatch(
+@@ -1143,8 +1165,11 @@ size_t ZSTD_RowFindBestMatch(
      const U32 rowEntries = (1U << rowLog);
      const U32 rowMask = rowEntries - 1;
      const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
@@ -19418,7 +26711,7 @@ index 0298a01a7504..f6b4978ceba7 100644
  
      /* DMS/DDS variables that may be referenced laster */
      const ZSTD_matchState_t* const dms = ms->dictMatchState;
-@@ -1168,7 +1171,7 @@ size_t ZSTD_RowFindBestMatch(
+@@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch(
      if (dictMode == ZSTD_dictMatchState) {
          /* Prefetch DMS rows */
          U32* const dmsHashTable = dms->hashTable;
@@ -19427,7 +26720,7 @@ index 0298a01a7504..f6b4978ceba7 100644
          U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
          U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
          dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
-@@ -1178,23 +1181,34 @@ size_t ZSTD_RowFindBestMatch(
+@@ -1178,23 +1203,34 @@ size_t ZSTD_RowFindBestMatch(
      }
  
      /* Update the hashTable and tagTable up to (but not including) ip */
@@ -19468,7 +26761,7 @@ index 0298a01a7504..f6b4978ceba7 100644
              assert(numMatches < rowEntries);
              if (matchIndex < lowLimit)
                  break;
-@@ -1204,13 +1218,14 @@ size_t ZSTD_RowFindBestMatch(
+@@ -1204,13 +1240,14 @@ size_t ZSTD_RowFindBestMatch(
                  PREFETCH_L1(dictBase + matchIndex);
              }
              matchBuffer[numMatches++] = matchIndex;
@@ -19484,7 +26777,7 @@ index 0298a01a7504..f6b4978ceba7 100644
              row[pos] = ms->nextToUpdate++;
          }
  
-@@ -1224,7 +1239,8 @@ size_t ZSTD_RowFindBestMatch(
+@@ -1224,7 +1261,8 @@ size_t ZSTD_RowFindBestMatch(
              if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
                  const BYTE* const match = base + matchIndex;
                  assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
@@ -19494,7 +26787,7 @@ index 0298a01a7504..f6b4978ceba7 100644
                      currentMl = ZSTD_count(ip, match, iLimit);
              } else {
                  const BYTE* const match = dictBase + matchIndex;
-@@ -1236,7 +1252,7 @@ size_t ZSTD_RowFindBestMatch(
+@@ -1236,7 +1274,7 @@ size_t ZSTD_RowFindBestMatch(
              /* Save best solution */
              if (currentMl > ml) {
                  ml = currentMl;
@@ -19503,7 +26796,7 @@ index 0298a01a7504..f6b4978ceba7 100644
                  if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
              }
          }
-@@ -1254,19 +1270,21 @@ size_t ZSTD_RowFindBestMatch(
+@@ -1254,19 +1292,21 @@ size_t ZSTD_RowFindBestMatch(
          const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
          const U32 dmsIndexDelta        = dictLimit - dmsSize;
  
@@ -19529,7 +26822,7 @@ index 0298a01a7504..f6b4978ceba7 100644
              }
  
              /* Return the longest match */
-@@ -1285,7 +1303,7 @@ size_t ZSTD_RowFindBestMatch(
+@@ -1285,7 +1325,7 @@ size_t ZSTD_RowFindBestMatch(
                  if (currentMl > ml) {
                      ml = currentMl;
                      assert(curr > matchIndex + dmsIndexDelta);
@@ -19538,7 +26831,19 @@ index 0298a01a7504..f6b4978ceba7 100644
                      if (ip+currentMl == iLimit) break;
                  }
              }
-@@ -1491,7 +1509,8 @@ ZSTD_compressBlock_lazy_generic(
+@@ -1472,8 +1512,9 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
+ *  Common parser - lazy strategy
+ *********************************/
+ 
+-FORCE_INLINE_TEMPLATE size_t
+-ZSTD_compressBlock_lazy_generic(
++FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++size_t ZSTD_compressBlock_lazy_generic(
+                         ZSTD_matchState_t* ms, seqStore_t* seqStore,
+                         U32 rep[ZSTD_REP_NUM],
+                         const void* src, size_t srcSize,
+@@ -1491,7 +1532,8 @@ ZSTD_compressBlock_lazy_generic(
      const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
      const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
  
@@ -19548,7 +26853,7 @@ index 0298a01a7504..f6b4978ceba7 100644
  
      const int isDMS = dictMode == ZSTD_dictMatchState;
      const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
-@@ -1512,8 +1531,8 @@ ZSTD_compressBlock_lazy_generic(
+@@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic(
          U32 const curr = (U32)(ip - base);
          U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
          U32 const maxRep = curr - windowLow;
@@ -19559,7 +26864,7 @@ index 0298a01a7504..f6b4978ceba7 100644
      }
      if (isDxS) {
          /* dictMatchState repCode checks don't currently handle repCode == 0
-@@ -1522,10 +1541,11 @@ ZSTD_compressBlock_lazy_generic(
+@@ -1522,10 +1564,11 @@ ZSTD_compressBlock_lazy_generic(
          assert(offset_2 <= dictAndPrefixLength);
      }
  
@@ -19574,7 +26879,7 @@ index 0298a01a7504..f6b4978ceba7 100644
      }
  
      /* Match Loop */
-@@ -1537,7 +1557,7 @@ ZSTD_compressBlock_lazy_generic(
+@@ -1537,7 +1580,7 @@ ZSTD_compressBlock_lazy_generic(
  #endif
      while (ip < ilimit) {
          size_t matchLength=0;
@@ -19583,7 +26888,7 @@ index 0298a01a7504..f6b4978ceba7 100644
          const BYTE* start=ip+1;
          DEBUGLOG(7, "search baseline (depth 0)");
  
-@@ -1562,14 +1582,23 @@ ZSTD_compressBlock_lazy_generic(
+@@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic(
          }
  
          /* first search (depth 0) */
@@ -19611,7 +26916,7 @@ index 0298a01a7504..f6b4978ceba7 100644
              continue;
          }
  
-@@ -1579,12 +1608,12 @@ ZSTD_compressBlock_lazy_generic(
+@@ -1579,12 +1631,12 @@ ZSTD_compressBlock_lazy_generic(
              DEBUGLOG(7, "search depth 1");
              ip ++;
              if ( (dictMode == ZSTD_noDict)
@@ -19627,7 +26932,7 @@ index 0298a01a7504..f6b4978ceba7 100644
              }
              if (isDxS) {
                  const U32 repIndex = (U32)(ip - base) - offset_1;
-@@ -1596,17 +1625,17 @@ ZSTD_compressBlock_lazy_generic(
+@@ -1596,17 +1648,17 @@ ZSTD_compressBlock_lazy_generic(
                      const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                      size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                      int const gain2 = (int)(mlRep * 3);
@@ -19652,7 +26957,7 @@ index 0298a01a7504..f6b4978ceba7 100644
                      continue;   /* search a better one */
              }   }
  
-@@ -1615,12 +1644,12 @@ ZSTD_compressBlock_lazy_generic(
+@@ -1615,12 +1667,12 @@ ZSTD_compressBlock_lazy_generic(
                  DEBUGLOG(7, "search depth 2");
                  ip ++;
                  if ( (dictMode == ZSTD_noDict)
@@ -19668,7 +26973,7 @@ index 0298a01a7504..f6b4978ceba7 100644
                  }
                  if (isDxS) {
                      const U32 repIndex = (U32)(ip - base) - offset_1;
-@@ -1632,17 +1661,17 @@ ZSTD_compressBlock_lazy_generic(
+@@ -1632,17 +1684,17 @@ ZSTD_compressBlock_lazy_generic(
                          const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                          size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                          int const gain2 = (int)(mlRep * 4);
@@ -19693,7 +26998,7 @@ index 0298a01a7504..f6b4978ceba7 100644
                          continue;
              }   }   }
              break;  /* nothing found : store previous solution */
-@@ -1653,26 +1682,33 @@ ZSTD_compressBlock_lazy_generic(
+@@ -1653,26 +1705,33 @@ ZSTD_compressBlock_lazy_generic(
           * notably if `value` is unsigned, resulting in a large positive `-value`.
           */
          /* catch up */
@@ -19733,7 +27038,7 @@ index 0298a01a7504..f6b4978ceba7 100644
  
          /* check immediate repcode */
          if (isDxS) {
-@@ -1686,8 +1722,8 @@ ZSTD_compressBlock_lazy_generic(
+@@ -1686,8 +1745,8 @@ ZSTD_compressBlock_lazy_generic(
                     && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                      const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
                      matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
@@ -19744,7 +27049,7 @@ index 0298a01a7504..f6b4978ceba7 100644
                      ip += matchLength;
                      anchor = ip;
                      continue;
-@@ -1701,16 +1737,20 @@ ZSTD_compressBlock_lazy_generic(
+@@ -1701,166 +1760,181 @@ ZSTD_compressBlock_lazy_generic(
                   && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
                  /* store sequence */
                  matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
@@ -19770,7 +27075,209 @@ index 0298a01a7504..f6b4978ceba7 100644
  
      /* Return the last literals size */
      return (size_t)(iend - anchor);
-@@ -1886,12 +1926,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+ }
++#endif /* build exclusions */
+ 
+ 
+-size_t ZSTD_compressBlock_btlazy2(
++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
++size_t ZSTD_compressBlock_greedy(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
+ }
+ 
+-size_t ZSTD_compressBlock_lazy2(
++size_t ZSTD_compressBlock_greedy_dictMatchState(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
+ }
+ 
+-size_t ZSTD_compressBlock_lazy(
++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
+ }
+ 
+-size_t ZSTD_compressBlock_greedy(
++size_t ZSTD_compressBlock_greedy_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
+ }
+ 
+-size_t ZSTD_compressBlock_btlazy2_dictMatchState(
++size_t ZSTD_compressBlock_greedy_dictMatchState_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
+ }
+ 
+-size_t ZSTD_compressBlock_lazy2_dictMatchState(
++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
+ }
++#endif
+ 
+-size_t ZSTD_compressBlock_lazy_dictMatchState(
++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
++size_t ZSTD_compressBlock_lazy(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
+ }
+ 
+-size_t ZSTD_compressBlock_greedy_dictMatchState(
++size_t ZSTD_compressBlock_lazy_dictMatchState(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
+ }
+ 
+-
+-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
+ }
+ 
+-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
++size_t ZSTD_compressBlock_lazy_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
+ }
+ 
+-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
++size_t ZSTD_compressBlock_lazy_dictMatchState_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
+ }
+ 
+-/* Row-based matchfinder */
+-size_t ZSTD_compressBlock_lazy2_row(
++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
+ }
++#endif
+ 
+-size_t ZSTD_compressBlock_lazy_row(
++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
++size_t ZSTD_compressBlock_lazy2(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
+ }
+ 
+-size_t ZSTD_compressBlock_greedy_row(
++size_t ZSTD_compressBlock_lazy2_dictMatchState(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
+ }
+ 
+-size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
+ }
+ 
+-size_t ZSTD_compressBlock_lazy_dictMatchState_row(
++size_t ZSTD_compressBlock_lazy2_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
+ }
+ 
+-size_t ZSTD_compressBlock_greedy_dictMatchState_row(
++size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
+ }
+ 
+-
+ size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
+ }
++#endif
+ 
+-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
++size_t ZSTD_compressBlock_btlazy2(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
+ }
+ 
+-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
++size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
++    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
+ }
++#endif
+ 
++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
+ FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_compressBlock_lazy_extDict_generic(
+                         ZSTD_matchState_t* ms, seqStore_t* seqStore,
+                         U32 rep[ZSTD_REP_NUM],
+@@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  
      DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
  
@@ -19787,7 +27294,7 @@ index 0298a01a7504..f6b4978ceba7 100644
      }
  
      /* Match Loop */
-@@ -1903,7 +1944,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+@@ -1903,7 +1978,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
  #endif
      while (ip < ilimit) {
          size_t matchLength=0;
@@ -19796,7 +27303,7 @@ index 0298a01a7504..f6b4978ceba7 100644
          const BYTE* start=ip+1;
          U32 curr = (U32)(ip-base);
  
-@@ -1922,14 +1963,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+@@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
          }   }
  
          /* first search (depth 0) */
@@ -19824,7 +27331,7 @@ index 0298a01a7504..f6b4978ceba7 100644
              continue;
          }
  
-@@ -1939,7 +1989,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+@@ -1939,7 +2023,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
              ip ++;
              curr++;
              /* check repCode */
@@ -19833,7 +27340,7 @@ index 0298a01a7504..f6b4978ceba7 100644
                  const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                  const U32 repIndex = (U32)(curr - offset_1);
                  const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
-@@ -1951,18 +2001,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+@@ -1951,18 +2035,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                      const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                      size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                      int const gain2 = (int)(repLength * 3);
@@ -19859,7 +27366,7 @@ index 0298a01a7504..f6b4978ceba7 100644
                      continue;   /* search a better one */
              }   }
  
-@@ -1971,7 +2021,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+@@ -1971,7 +2055,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                  ip ++;
                  curr++;
                  /* check repCode */
@@ -19868,7 +27375,7 @@ index 0298a01a7504..f6b4978ceba7 100644
                      const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                      const U32 repIndex = (U32)(curr - offset_1);
                      const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
-@@ -1983,38 +2033,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+@@ -1983,38 +2067,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                          const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                          size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                          int const gain2 = (int)(repLength * 4);
@@ -19925,7 +27432,7 @@ index 0298a01a7504..f6b4978ceba7 100644
  
          /* check immediate repcode */
          while (ip <= ilimit) {
-@@ -2029,8 +2086,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+@@ -2029,8 +2120,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                  /* repcode detected we should take it */
                  const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                  matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
@@ -19936,16 +27443,89 @@ index 0298a01a7504..f6b4978ceba7 100644
                  ip += matchLength;
                  anchor = ip;
                  continue;   /* faster when present ... (?) */
-@@ -2096,7 +2153,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row(
- size_t ZSTD_compressBlock_lazy2_extDict_row(
+@@ -2045,8 +2136,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
+     /* Return the last literals size */
+     return (size_t)(iend - anchor);
+ }
++#endif /* build exclusions */
+ 
+-
++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
+ size_t ZSTD_compressBlock_greedy_extDict(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+@@ -2054,49 +2146,55 @@ size_t ZSTD_compressBlock_greedy_extDict(
+     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
+ }
+ 
+-size_t ZSTD_compressBlock_lazy_extDict(
++size_t ZSTD_compressBlock_greedy_extDict_row(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize)
 -
  {
-     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
+-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
++    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
  }
++#endif
+ 
+-size_t ZSTD_compressBlock_lazy2_extDict(
++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
++size_t ZSTD_compressBlock_lazy_extDict(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ 
+ {
+-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
++    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
+ }
+ 
+-size_t ZSTD_compressBlock_btlazy2_extDict(
++size_t ZSTD_compressBlock_lazy_extDict_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ 
+ {
+-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
++    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
+ }
++#endif
+ 
+-size_t ZSTD_compressBlock_greedy_extDict_row(
++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
++size_t ZSTD_compressBlock_lazy2_extDict(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
++
+ {
+-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
++    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
+ }
+ 
+-size_t ZSTD_compressBlock_lazy_extDict_row(
++size_t ZSTD_compressBlock_lazy2_extDict_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+-
+ {
+-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
++    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
+ }
++#endif
+ 
+-size_t ZSTD_compressBlock_lazy2_extDict_row(
++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
++size_t ZSTD_compressBlock_btlazy2_extDict(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize)
+ 
+ {
+-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
++    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
+ }
++#endif
 diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h
-index e5bdf4df8dde..9505bed93c03 100644
+index e5bdf4df8dde..22c9201f4e63 100644
 --- a/lib/zstd/compress/zstd_lazy.h
 +++ b/lib/zstd/compress/zstd_lazy.h
 @@ -1,5 +1,6 @@
@@ -19956,26 +27536,213 @@ index e5bdf4df8dde..9505bed93c03 100644
   * All rights reserved.
   *
   * This source code is licensed under both the BSD-style license (found in the
-@@ -22,6 +23,8 @@
+@@ -22,98 +23,175 @@
   */
  #define ZSTD_LAZY_DDSS_BUCKET_LOG 2
  
 +#define ZSTD_ROW_HASH_TAG_BITS 8        /* nb bits to use for the tag */
 +
++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
  U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
  void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip);
  
-@@ -113,7 +116,7 @@ size_t ZSTD_compressBlock_lazy2_extDict_row(
+ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip);
+ 
+ void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue);  /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */
++#endif
+ 
+-size_t ZSTD_compressBlock_btlazy2(
++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
++size_t ZSTD_compressBlock_greedy(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy2(
++size_t ZSTD_compressBlock_greedy_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy(
++size_t ZSTD_compressBlock_greedy_dictMatchState(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_greedy(
++size_t ZSTD_compressBlock_greedy_dictMatchState_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy2_row(
++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy_row(
++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_greedy_row(
++size_t ZSTD_compressBlock_greedy_extDict(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-
+-size_t ZSTD_compressBlock_btlazy2_dictMatchState(
++size_t ZSTD_compressBlock_greedy_extDict_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy2_dictMatchState(
++
++#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy
++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row
++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState
++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row
++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch
++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row
++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict
++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row
++#else
++#define ZSTD_COMPRESSBLOCK_GREEDY NULL
++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL
++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL
++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL
++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL
++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL
++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL
++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL
++#endif
++
++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
++size_t ZSTD_compressBlock_lazy(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy_dictMatchState(
++size_t ZSTD_compressBlock_lazy_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_greedy_dictMatchState(
++size_t ZSTD_compressBlock_lazy_dictMatchState(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
++size_t ZSTD_compressBlock_lazy_dictMatchState_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy_dictMatchState_row(
++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_greedy_dictMatchState_row(
++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-
+-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
++size_t ZSTD_compressBlock_lazy_extDict(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
++size_t ZSTD_compressBlock_lazy_extDict_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
++
++#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy
++#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row
++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState
++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row
++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch
++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row
++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict
++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row
++#else
++#define ZSTD_COMPRESSBLOCK_LAZY NULL
++#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL
++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL
++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL
++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL
++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL
++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL
++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL
++#endif
++
++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
++size_t ZSTD_compressBlock_lazy2(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
++size_t ZSTD_compressBlock_lazy2_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
++size_t ZSTD_compressBlock_lazy2_dictMatchState(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
++size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-
+-size_t ZSTD_compressBlock_greedy_extDict(
++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy_extDict(
++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+ size_t ZSTD_compressBlock_lazy2_extDict(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_greedy_extDict_row(
++size_t ZSTD_compressBlock_lazy2_extDict_row(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy_extDict_row(
++
++#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2
++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row
++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState
++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row
++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch
++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row
++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict
++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row
++#else
++#define ZSTD_COMPRESSBLOCK_LAZY2 NULL
++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL
++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL
++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL
++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL
++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL
++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL
++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL
++#endif
++
++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
++size_t ZSTD_compressBlock_btlazy2(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_lazy2_extDict_row(
++size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
  size_t ZSTD_compressBlock_btlazy2_extDict(
          ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
          void const* src, size_t srcSize);
 -        
++
++#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2
++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState
++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict
++#else
++#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL
++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL
++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL
++#endif
 +
  
  
  #endif /* ZSTD_LAZY_H */
 diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c
-index dd86fc83e7dd..b7da76b0db7c 100644
+index dd86fc83e7dd..07f3bc6437ce 100644
 --- a/lib/zstd/compress/zstd_ldm.c
 +++ b/lib/zstd/compress/zstd_ldm.c
 @@ -1,5 +1,6 @@
@@ -19986,7 +27753,7 @@ index dd86fc83e7dd..b7da76b0db7c 100644
   * All rights reserved.
   *
   * This source code is licensed under both the BSD-style license (found in the
-@@ -242,11 +243,11 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
+@@ -242,11 +243,15 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
      switch(ms->cParams.strategy)
      {
      case ZSTD_fast:
@@ -19996,11 +27763,26 @@ index dd86fc83e7dd..b7da76b0db7c 100644
  
      case ZSTD_dfast:
 -        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast);
++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
 +        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
++#else
++        assert(0); /* shouldn't be called: cparams should've been adjusted. */
++#endif
          break;
  
      case ZSTD_greedy:
-@@ -549,7 +550,7 @@ size_t ZSTD_ldm_generateSequences(
+@@ -318,7 +323,9 @@ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor)
+     }
+ }
+ 
+-static size_t ZSTD_ldm_generateSequences_internal(
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++size_t ZSTD_ldm_generateSequences_internal(
+         ldmState_t* ldmState, rawSeqStore_t* rawSeqStore,
+         ldmParams_t const* params, void const* src, size_t srcSize)
+ {
+@@ -549,7 +556,7 @@ size_t ZSTD_ldm_generateSequences(
           * the window through early invalidation.
           * TODO: * Test the chunk size.
           *       * Try invalidation after the sequence generation and test the
@@ -20009,7 +27791,23 @@ index dd86fc83e7dd..b7da76b0db7c 100644
           *
           * NOTE: Because of dictionaries + sequence splitting we MUST make sure
           * that any offset used is valid at the END of the sequence, since it may
-@@ -711,7 +712,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+@@ -689,7 +696,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+         /* maybeSplitSequence updates rawSeqStore->pos */
+         rawSeq const sequence = maybeSplitSequence(rawSeqStore,
+                                                    (U32)(iend - ip), minMatch);
+-        int i;
+         /* End signal */
+         if (sequence.offset == 0)
+             break;
+@@ -702,6 +708,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+         /* Run the block compressor */
+         DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength);
+         {
++            int i;
+             size_t const newLitLength =
+                 blockCompressor(ms, seqStore, rep, ip, sequence.litLength);
+             ip += sequence.litLength;
+@@ -711,7 +718,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
              rep[0] = sequence.offset;
              /* Store the sequence */
              ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend,
@@ -20043,7 +27841,7 @@ index 647f865be290..cfccfc46f6f7 100644
   *
   * This source code is licensed under both the BSD-style license (found in the
 diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c
-index fd82acfda62f..1e41cb04f482 100644
+index fd82acfda62f..a87b66ac8d24 100644
 --- a/lib/zstd/compress/zstd_opt.c
 +++ b/lib/zstd/compress/zstd_opt.c
 @@ -1,5 +1,6 @@
@@ -20054,7 +27852,14 @@ index fd82acfda62f..1e41cb04f482 100644
   * All rights reserved.
   *
   * This source code is licensed under both the BSD-style license (found in the
-@@ -16,7 +17,7 @@
+@@ -12,11 +13,14 @@
+ #include "hist.h"
+ #include "zstd_opt.h"
+ 
++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
+ 
  #define ZSTD_LITFREQ_ADD    2   /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
  #define ZSTD_MAX_PRICE     (1<<30)
  
@@ -20063,7 +27868,7 @@ index fd82acfda62f..1e41cb04f482 100644
  
  
  /*-*************************************
-@@ -26,27 +27,35 @@
+@@ -26,27 +30,35 @@
  #if 0    /* approximation at bit level (for tests) */
  #  define BITCOST_ACCURACY 0
  #  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
@@ -20102,7 +27907,7 @@ index fd82acfda62f..1e41cb04f482 100644
      U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
      U32 const weight = BWeight + FWeight;
      assert(hb + BITCOST_ACCURACY < 31);
-@@ -57,7 +66,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
+@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
  /* debugging function,
   * @return price in bytes as fractional value
   * for debug messages only */
@@ -20111,7 +27916,7 @@ index fd82acfda62f..1e41cb04f482 100644
  {
      return (double)price / (BITCOST_MULTIPLIER*8);
  }
-@@ -88,20 +97,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts)
+@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts)
      return total;
  }
  
@@ -20143,7 +27948,7 @@ index fd82acfda62f..1e41cb04f482 100644
   * return the resulting sum of elements */
  static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
  {
-@@ -110,7 +125,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
+@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
      DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget);
      assert(logTarget < 30);
      if (factor <= 1) return prevsum;
@@ -20152,7 +27957,7 @@ index fd82acfda62f..1e41cb04f482 100644
  }
  
  /* ZSTD_rescaleFreqs() :
-@@ -129,18 +144,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
+@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
      DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize);
      optPtr->priceType = zop_dynamic;
  
@@ -20179,7 +27984,7 @@ index fd82acfda62f..1e41cb04f482 100644
                  unsigned lit;
                  assert(optPtr->litFreq != NULL);
                  optPtr->litSum = 0;
-@@ -188,13 +207,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
+@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
                      optPtr->offCodeSum += optPtr->offCodeFreq[of];
              }   }
  
@@ -20196,7 +28001,7 @@ index fd82acfda62f..1e41cb04f482 100644
              }
  
              {   unsigned const baseLLfreqs[MaxLL+1] = {
-@@ -224,10 +244,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
+@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
                  optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1);
              }
  
@@ -20208,7 +28013,15 @@ index fd82acfda62f..1e41cb04f482 100644
  
          if (compressedLiterals)
              optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12);
-@@ -255,11 +274,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
+@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
+                                 const optState_t* const optPtr,
+                                 int optLevel)
+ {
++    DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength);
+     if (litLength == 0) return 0;
+ 
+     if (!ZSTD_compressedLiterals(optPtr))
+@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
          return (litLength*6) * BITCOST_MULTIPLIER;  /* 6 bit per literal - no statistic used */
  
      /* dynamic statistics */
@@ -20226,7 +28039,7 @@ index fd82acfda62f..1e41cb04f482 100644
          }
          return price;
      }
-@@ -272,10 +294,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
+@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
      assert(litLength <= ZSTD_BLOCKSIZE_MAX);
      if (optPtr->priceType == zop_predef)
          return WEIGHT(litLength, optLevel);
@@ -20242,7 +28055,7 @@ index fd82acfda62f..1e41cb04f482 100644
       */
      if (litLength == ZSTD_BLOCKSIZE_MAX)
          return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel);
-@@ -289,24 +312,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
+@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
  }
  
  /* ZSTD_getMatchPrice() :
@@ -20274,7 +28087,7 @@ index fd82acfda62f..1e41cb04f482 100644
  
      /* dynamic statistics */
      price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel));
-@@ -325,10 +349,10 @@ ZSTD_getMatchPrice(U32 const offcode,
+@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode,
  }
  
  /* ZSTD_updateStats() :
@@ -20287,7 +28100,7 @@ index fd82acfda62f..1e41cb04f482 100644
  {
      /* literals */
      if (ZSTD_compressedLiterals(optPtr)) {
-@@ -344,8 +368,8 @@ static void ZSTD_updateStats(optState_t* const optPtr,
+@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* const optPtr,
          optPtr->litLengthSum++;
      }
  
@@ -20298,11 +28111,53 @@ index fd82acfda62f..1e41cb04f482 100644
          assert(offCode <= MaxOff);
          optPtr->offCodeFreq[offCode]++;
          optPtr->offCodeSum++;
-@@ -552,16 +576,17 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
-     ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict);
+@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length)
+ 
+ /* Update hashTable3 up to ip (excluded)
+    Assumption : always within prefix (i.e. not within extDict) */
+-static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,
+-                                              U32* nextToUpdate3,
+-                                              const BYTE* const ip)
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,
++                                       U32* nextToUpdate3,
++                                       const BYTE* const ip)
+ {
+     U32* const hashTable3 = ms->hashTable3;
+     U32 const hashLog3 = ms->hashLog3;
+@@ -408,7 +438,9 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,
+  * @param ip assumed <= iend-8 .
+  * @param target The target of ZSTD_updateTree_internal() - we are filling to this position
+  * @return : nb of positions added */
+-static U32 ZSTD_insertBt1(
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++U32 ZSTD_insertBt1(
+                 const ZSTD_matchState_t* ms,
+                 const BYTE* const ip, const BYTE* const iend,
+                 U32 const target,
+@@ -527,6 +559,7 @@ static U32 ZSTD_insertBt1(
  }
  
--FORCE_INLINE_TEMPLATE
+ FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ void ZSTD_updateTree_internal(
+                 ZSTD_matchState_t* ms,
+                 const BYTE* const ip, const BYTE* const iend,
+@@ -535,7 +568,7 @@ void ZSTD_updateTree_internal(
+     const BYTE* const base = ms->window.base;
+     U32 const target = (U32)(ip - base);
+     U32 idx = ms->nextToUpdate;
+-    DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u  (dictMode:%u)",
++    DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u  (dictMode:%u)",
+                 idx, target, dictMode);
+ 
+     while(idx < target) {
+@@ -553,15 +586,18 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
+ }
+ 
+ FORCE_INLINE_TEMPLATE
 -U32 ZSTD_insertBtAndGetAllMatches (
 -                    ZSTD_match_t* matches,   /* store result (found matches) in this table (presumed large enough) */
 -                    ZSTD_matchState_t* ms,
@@ -20312,7 +28167,8 @@ index fd82acfda62f..1e41cb04f482 100644
 -                    U32 const ll0,   /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
 -                    const U32 lengthToBeat,
 -                    U32 const mls /* template */)
-+FORCE_INLINE_TEMPLATE U32
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++U32
 +ZSTD_insertBtAndGetAllMatches (
 +                ZSTD_match_t* matches,  /* store result (found matches) in this table (presumed large enough) */
 +                ZSTD_matchState_t* ms,
@@ -20326,7 +28182,7 @@ index fd82acfda62f..1e41cb04f482 100644
  {
      const ZSTD_compressionParameters* const cParams = &ms->cParams;
      U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
-@@ -644,7 +669,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
+@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
                  DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
                              repCode, ll0, repOffset, repLen);
                  bestLength = repLen;
@@ -20335,7 +28191,7 @@ index fd82acfda62f..1e41cb04f482 100644
                  matches[mnum].len = (U32)repLen;
                  mnum++;
                  if ( (repLen > sufficient_len)
-@@ -673,7 +698,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
+@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
                  bestLength = mlen;
                  assert(curr > matchIndex3);
                  assert(mnum==0);  /* no prior solution */
@@ -20344,7 +28200,7 @@ index fd82acfda62f..1e41cb04f482 100644
                  matches[0].len = (U32)mlen;
                  mnum = 1;
                  if ( (mlen > sufficient_len) |
-@@ -706,13 +731,13 @@ U32 ZSTD_insertBtAndGetAllMatches (
+@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches (
          }
  
          if (matchLength > bestLength) {
@@ -20361,7 +28217,7 @@ index fd82acfda62f..1e41cb04f482 100644
              matches[mnum].len = (U32)matchLength;
              mnum++;
              if ( (matchLength > ZSTD_OPT_NUM)
-@@ -754,12 +779,12 @@ U32 ZSTD_insertBtAndGetAllMatches (
+@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches (
  
              if (matchLength > bestLength) {
                  matchIndex = dictMatchIndex + dmsIndexDelta;
@@ -20377,7 +28233,18 @@ index fd82acfda62f..1e41cb04f482 100644
                  matches[mnum].len = (U32)matchLength;
                  mnum++;
                  if ( (matchLength > ZSTD_OPT_NUM)
-@@ -960,7 +985,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
+@@ -792,7 +828,9 @@ typedef U32 (*ZSTD_getAllMatchesFn)(
+     U32 const ll0,
+     U32 const lengthToBeat);
+ 
+-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal(
++FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++U32 ZSTD_btGetAllMatches_internal(
+         ZSTD_match_t* matches,
+         ZSTD_matchState_t* ms,
+         U32* nextToUpdate3,
+@@ -960,7 +998,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
                                        const ZSTD_optLdm_t* optLdm, U32 currPosInBlock)
  {
      U32 const posDiff = currPosInBlock - optLdm->startPosInBlock;
@@ -20386,7 +28253,7 @@ index fd82acfda62f..1e41cb04f482 100644
      U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff;
  
      /* Ensure that current block position is not outside of the match */
-@@ -971,11 +996,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
+@@ -971,11 +1009,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
      }
  
      if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) {
@@ -20402,16 +28269,84 @@ index fd82acfda62f..1e41cb04f482 100644
          (*nbMatches)++;
      }
  }
-@@ -1062,6 +1087,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
-     ZSTD_optimal_t lastSequence;
+@@ -1011,11 +1049,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm,
+ *  Optimal parser
+ *********************************/
+ 
+-static U32 ZSTD_totalLen(ZSTD_optimal_t sol)
+-{
+-    return sol.litlen + sol.mlen;
+-}
+-
+ #if 0 /* debug */
+ 
+ static void
+@@ -1033,7 +1066,13 @@ listStats(const U32* table, int lastEltID)
+ 
+ #endif
+ 
+-FORCE_INLINE_TEMPLATE size_t
++#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel)
++#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel)
++#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1))
++
++FORCE_INLINE_TEMPLATE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++size_t
+ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+                                seqStore_t* seqStore,
+                                U32 rep[ZSTD_REP_NUM],
+@@ -1059,9 +1098,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+ 
+     ZSTD_optimal_t* const opt = optStatePtr->priceTable;
+     ZSTD_match_t* const matches = optStatePtr->matchTable;
+-    ZSTD_optimal_t lastSequence;
++    ZSTD_optimal_t lastStretch;
      ZSTD_optLdm_t optLdm;
  
-+    ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t));
++    ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t));
 +
      optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore;
      optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0;
      ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip));
-@@ -1098,14 +1125,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+@@ -1082,103 +1123,139 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+             U32 const ll0 = !litlen;
+             U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch);
+             ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
+-                                              (U32)(ip-istart), (U32)(iend - ip));
+-            if (!nbMatches) { ip++; continue; }
++                                              (U32)(ip-istart), (U32)(iend-ip));
++            if (!nbMatches) {
++                DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart));
++                ip++;
++                continue;
++            }
++
++            /* Match found: let's store this solution, and eventually find more candidates.
++             * During this forward pass, @opt is used to store stretches,
++             * defined as "a match followed by N literals".
++             * Note how this is different from a Sequence, which is "N literals followed by a match".
++             * Storing stretches allows us to store different match predecessors
++             * for each literal position part of a literals run. */
+ 
+             /* initialize opt[0] */
+-            { U32 i ; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; }
+-            opt[0].mlen = 0;  /* means is_a_literal */
++            opt[0].mlen = 0;  /* there are only literals so far */
+             opt[0].litlen = litlen;
+-            /* We don't need to include the actual price of the literals because
+-             * it is static for the duration of the forward pass, and is included
+-             * in every price. We include the literal length to avoid negative
+-             * prices when we subtract the previous literal length.
++            /* No need to include the actual price of the literals before the first match
++             * because it is static for the duration of the forward pass, and is included
++             * in every subsequent price. But, we include the literal length because
++             * the cost variation of litlen depends on the value of litlen.
+              */
+-            opt[0].price = (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel);
++            opt[0].price = LL_PRICE(litlen);
++            ZSTD_STATIC_ASSERT(sizeof(opt[0].rep[0]) == sizeof(rep[0]));
++            ZSTD_memcpy(&opt[0].rep, rep, sizeof(opt[0].rep));
  
              /* large match -> immediate encoding */
              {   U32 const maxML = matches[nbMatches-1].len;
@@ -20423,15 +28358,32 @@ index fd82acfda62f..1e41cb04f482 100644
 +                            nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart));
  
                  if (maxML > sufficient_len) {
-                     lastSequence.litlen = litlen;
-                     lastSequence.mlen = maxML;
+-                    lastSequence.litlen = litlen;
+-                    lastSequence.mlen = maxML;
 -                    lastSequence.off = maxOffcode;
-+                    lastSequence.off = maxOffBase;
-                     DEBUGLOG(6, "large match (%u>%u), immediate encoding",
+-                    DEBUGLOG(6, "large match (%u>%u), immediate encoding",
++                    lastStretch.litlen = 0;
++                    lastStretch.mlen = maxML;
++                    lastStretch.off = maxOffBase;
++                    DEBUGLOG(6, "large match (%u>%u) => immediate encoding",
                                  maxML, sufficient_len);
                      cur = 0;
-@@ -1122,15 +1149,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
-                     opt[pos].price = ZSTD_MAX_PRICE;   /* mlen, litlen and price will be fixed during forward scanning */
+-                    last_pos = ZSTD_totalLen(lastSequence);
++                    last_pos = maxML;
+                     goto _shortestPath;
+             }   }
+ 
+             /* set prices for first matches starting position == 0 */
+             assert(opt[0].price >= 0);
+-            {   U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
+-                U32 pos;
++            {   U32 pos;
+                 U32 matchNb;
+                 for (pos = 1; pos < minMatch; pos++) {
+-                    opt[pos].price = ZSTD_MAX_PRICE;   /* mlen, litlen and price will be fixed during forward scanning */
++                    opt[pos].price = ZSTD_MAX_PRICE;
++                    opt[pos].mlen = 0;
++                    opt[pos].litlen = litlen + pos;
                  }
                  for (matchNb = 0; matchNb < nbMatches; matchNb++) {
 -                    U32 const offcode = matches[matchNb].off;
@@ -20439,27 +28391,295 @@ index fd82acfda62f..1e41cb04f482 100644
                      U32 const end = matches[matchNb].len;
                      for ( ; pos <= end ; pos++ ) {
 -                        U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel);
-+                        U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel);
-                         U32 const sequencePrice = literalsPrice + matchPrice;
+-                        U32 const sequencePrice = literalsPrice + matchPrice;
++                        int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel);
++                        int const sequencePrice = opt[0].price + matchPrice;
                          DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
--                                    pos, ZSTD_fCost(sequencePrice));
-+                                    pos, ZSTD_fCost((int)sequencePrice));
+                                     pos, ZSTD_fCost(sequencePrice));
                          opt[pos].mlen = pos;
 -                        opt[pos].off = offcode;
+-                        opt[pos].litlen = litlen;
+-                        opt[pos].price = (int)sequencePrice;
+-                }   }
 +                        opt[pos].off = offBase;
-                         opt[pos].litlen = litlen;
-                         opt[pos].price = (int)sequencePrice;
++                        opt[pos].litlen = 0; /* end of match */
++                        opt[pos].price = sequencePrice + LL_PRICE(0);
++                    }
++                }
+                 last_pos = pos-1;
++                opt[pos].price = ZSTD_MAX_PRICE;
+             }
+         }
+ 
+         /* check further positions */
+         for (cur = 1; cur <= last_pos; cur++) {
+             const BYTE* const inr = ip + cur;
+-            assert(cur < ZSTD_OPT_NUM);
+-            DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur)
++            assert(cur <= ZSTD_OPT_NUM);
++            DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur);
+ 
+             /* Fix current position with one literal if cheaper */
+-            {   U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1;
++            {   U32 const litlen = opt[cur-1].litlen + 1;
+                 int const price = opt[cur-1].price
+-                                + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel)
+-                                + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel)
+-                                - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel);
++                                + LIT_PRICE(ip+cur-1)
++                                + LL_INCPRICE(litlen);
+                 assert(price < 1000000000); /* overflow check */
+                 if (price <= opt[cur].price) {
++                    ZSTD_optimal_t const prevMatch = opt[cur];
+                     DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)",
+                                 inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen,
+                                 opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]);
+-                    opt[cur].mlen = 0;
+-                    opt[cur].off = 0;
++                    opt[cur] = opt[cur-1];
+                     opt[cur].litlen = litlen;
+                     opt[cur].price = price;
++                    if ( (optLevel >= 1) /* additional check only for higher modes */
++                      && (prevMatch.litlen == 0) /* replace a match */
++                      && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */
++                      && LIKELY(ip + cur < iend)
++                    ) {
++                        /* check next position, in case it would be cheaper */
++                        int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1);
++                        int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1);
++                        DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f",
++                                cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals));
++                        if ( (with1literal < withMoreLiterals)
++                          && (with1literal < opt[cur+1].price) ) {
++                            /* update offset history - before it disappears */
++                            U32 const prev = cur - prevMatch.mlen;
++                            repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0);
++                            assert(cur >= prevMatch.mlen);
++                            DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !",
++                                        ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals),
++                                        newReps.rep[0], newReps.rep[1], newReps.rep[2] );
++                            opt[cur+1] = prevMatch;  /* mlen & offbase */
++                            ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(repcodes_t));
++                            opt[cur+1].litlen = 1;
++                            opt[cur+1].price = with1literal;
++                            if (last_pos < cur+1) last_pos = cur+1;
++                        }
++                    }
+                 } else {
+-                    DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)",
+-                                inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price),
+-                                opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]);
++                    DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f)",
++                                inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price));
+                 }
+             }
+ 
+-            /* Set the repcodes of the current position. We must do it here
+-             * because we rely on the repcodes of the 2nd to last sequence being
+-             * correct to set the next chunks repcodes during the backward
+-             * traversal.
++            /* Offset history is not updated during match comparison.
++             * Do it here, now that the match is selected and confirmed.
+              */
+             ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t));
+             assert(cur >= opt[cur].mlen);
+-            if (opt[cur].mlen != 0) {
++            if (opt[cur].litlen == 0) {
++                /* just finished a match => alter offset history */
+                 U32 const prev = cur - opt[cur].mlen;
+-                repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0);
++                repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0);
+                 ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t));
+-            } else {
+-                ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t));
+             }
+ 
+             /* last match must start at a minimum distance of 8 from oend */
+@@ -1188,15 +1265,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+ 
+             if ( (optLevel==0) /*static_test*/
+               && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) {
+-                DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1);
++                DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1);
+                 continue;  /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
+             }
+ 
+             assert(opt[cur].price >= 0);
+-            {   U32 const ll0 = (opt[cur].mlen != 0);
+-                U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
+-                U32 const previousPrice = (U32)opt[cur].price;
+-                U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
++            {   U32 const ll0 = (opt[cur].litlen == 0);
++                int const previousPrice = opt[cur].price;
++                int const basePrice = previousPrice + LL_PRICE(0);
+                 U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch);
+                 U32 matchNb;
+ 
+@@ -1208,18 +1284,17 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+                     continue;
+                 }
+ 
+-                {   U32 const maxML = matches[nbMatches-1].len;
+-                    DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u",
+-                                inr-istart, cur, nbMatches, maxML);
+-
+-                    if ( (maxML > sufficient_len)
+-                      || (cur + maxML >= ZSTD_OPT_NUM) ) {
+-                        lastSequence.mlen = maxML;
+-                        lastSequence.off = matches[nbMatches-1].off;
+-                        lastSequence.litlen = litlen;
+-                        cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0;  /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */
+-                        last_pos = cur + ZSTD_totalLen(lastSequence);
+-                        if (cur > ZSTD_OPT_NUM) cur = 0;   /* underflow => first match */
++                {   U32 const longestML = matches[nbMatches-1].len;
++                    DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of longest ML=%u",
++                                inr-istart, cur, nbMatches, longestML);
++
++                    if ( (longestML > sufficient_len)
++                      || (cur + longestML >= ZSTD_OPT_NUM)
++                      || (ip + cur + longestML >= iend) ) {
++                        lastStretch.mlen = longestML;
++                        lastStretch.off = matches[nbMatches-1].off;
++                        lastStretch.litlen = 0;
++                        last_pos = cur + longestML;
+                         goto _shortestPath;
                  }   }
-@@ -1230,7 +1257,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+ 
+@@ -1230,20 +1305,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                      U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
                      U32 mlen;
  
 -                    DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u",
+-                                matchNb, matches[matchNb].off, lastML, litlen);
 +                    DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u",
-                                 matchNb, matches[matchNb].off, lastML, litlen);
++                                matchNb, matches[matchNb].off, lastML, opt[cur].litlen);
  
                      for (mlen = lastML; mlen >= startML; mlen--) {  /* scan downward */
-@@ -1296,7 +1323,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+                         U32 const pos = cur + mlen;
+-                        int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
++                        int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
+ 
+                         if ((pos > last_pos) || (price < opt[pos].price)) {
+                             DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)",
+                                         pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
+-                            while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; }   /* fill empty positions */
++                            while (last_pos < pos) {
++                                /* fill empty positions, for future comparisons */
++                                last_pos++;
++                                opt[last_pos].price = ZSTD_MAX_PRICE;
++                                opt[last_pos].litlen = !0;  /* just needs to be != 0, to mean "not an end of match" */
++                            }
+                             opt[pos].mlen = mlen;
+                             opt[pos].off = offset;
+-                            opt[pos].litlen = litlen;
++                            opt[pos].litlen = 0;
+                             opt[pos].price = price;
+                         } else {
+                             DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)",
+@@ -1251,52 +1331,86 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+                             if (optLevel==0) break;  /* early update abort; gets ~+10% speed for about -0.01 ratio loss */
+                         }
+             }   }   }
++            opt[last_pos+1].price = ZSTD_MAX_PRICE;
+         }  /* for (cur = 1; cur <= last_pos; cur++) */
+ 
+-        lastSequence = opt[last_pos];
+-        cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0;  /* single sequence, and it starts before `ip` */
+-        assert(cur < ZSTD_OPT_NUM);  /* control overflow*/
++        lastStretch = opt[last_pos];
++        assert(cur >= lastStretch.mlen);
++        cur = last_pos - lastStretch.mlen;
+ 
+ _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
+         assert(opt[0].mlen == 0);
++        assert(last_pos >= lastStretch.mlen);
++        assert(cur == last_pos - lastStretch.mlen);
+ 
+-        /* Set the next chunk's repcodes based on the repcodes of the beginning
+-         * of the last match, and the last sequence. This avoids us having to
+-         * update them while traversing the sequences.
+-         */
+-        if (lastSequence.mlen != 0) {
+-            repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0);
+-            ZSTD_memcpy(rep, &reps, sizeof(reps));
++        if (lastStretch.mlen==0) {
++            /* no solution : all matches have been converted into literals */
++            assert(lastStretch.litlen == (ip - anchor) + last_pos);
++            ip += last_pos;
++            continue;
++        }
++        assert(lastStretch.off > 0);
++
++        /* Update offset history */
++        if (lastStretch.litlen == 0) {
++            /* finishing on a match : update offset history */
++            repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0);
++            ZSTD_memcpy(rep, &reps, sizeof(repcodes_t));
+         } else {
+-            ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t));
++            ZSTD_memcpy(rep, lastStretch.rep, sizeof(repcodes_t));
++            assert(cur >= lastStretch.litlen);
++            cur -= lastStretch.litlen;
+         }
+ 
+-        {   U32 const storeEnd = cur + 1;
++        /* Let's write the shortest path solution.
++         * It is stored in @opt in reverse order,
++         * starting from @storeEnd (==cur+2),
++         * effectively partially @opt overwriting.
++         * Content is changed too:
++         * - So far, @opt stored stretches, aka a match followed by literals
++         * - Now, it will store sequences, aka literals followed by a match
++         */
++        {   U32 const storeEnd = cur + 2;
+             U32 storeStart = storeEnd;
+-            U32 seqPos = cur;
++            U32 stretchPos = cur;
+ 
+             DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)",
+                         last_pos, cur); (void)last_pos;
+-            assert(storeEnd < ZSTD_OPT_NUM);
+-            DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
+-                        storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off);
+-            opt[storeEnd] = lastSequence;
+-            while (seqPos > 0) {
+-                U32 const backDist = ZSTD_totalLen(opt[seqPos]);
++            assert(storeEnd < ZSTD_OPT_SIZE);
++            DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
++                        storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off);
++            if (lastStretch.litlen > 0) {
++                /* last "sequence" is unfinished: just a bunch of literals */
++                opt[storeEnd].litlen = lastStretch.litlen;
++                opt[storeEnd].mlen = 0;
++                storeStart = storeEnd-1;
++                opt[storeStart] = lastStretch;
++            } {
++                opt[storeEnd] = lastStretch;  /* note: litlen will be fixed */
++                storeStart = storeEnd;
++            }
++            while (1) {
++                ZSTD_optimal_t nextStretch = opt[stretchPos];
++                opt[storeStart].litlen = nextStretch.litlen;
++                DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)",
++                            opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off);
++                if (nextStretch.mlen == 0) {
++                    /* reaching beginning of segment */
++                    break;
++                }
+                 storeStart--;
+-                DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
+-                            seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off);
+-                opt[storeStart] = opt[seqPos];
+-                seqPos = (seqPos > backDist) ? seqPos - backDist : 0;
++                opt[storeStart] = nextStretch; /* note: litlen will be fixed */
++                assert(nextStretch.litlen + nextStretch.mlen <= stretchPos);
++                stretchPos -= nextStretch.litlen + nextStretch.mlen;
+             }
+ 
+             /* save sequences */
+-            DEBUGLOG(6, "sending selected sequences into seqStore")
++            DEBUGLOG(6, "sending selected sequences into seqStore");
+             {   U32 storePos;
                  for (storePos=storeStart; storePos <= storeEnd; storePos++) {
                      U32 const llen = opt[storePos].litlen;
                      U32 const mlen = opt[storePos].mlen;
@@ -20468,7 +28688,7 @@ index fd82acfda62f..1e41cb04f482 100644
                      U32 const advance = llen + mlen;
                      DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u",
                                  anchor - istart, (unsigned)llen, (unsigned)mlen);
-@@ -1308,8 +1335,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+@@ -1308,11 +1422,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                      }
  
                      assert(anchor + llen <= iend);
@@ -20479,16 +28699,71 @@ index fd82acfda62f..1e41cb04f482 100644
                      anchor += advance;
                      ip = anchor;
              }   }
-@@ -1349,7 +1376,7 @@ size_t ZSTD_compressBlock_btopt(
++            DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]);
++
++            /* update all costs */
+             ZSTD_setBasePrices(optStatePtr, optLevel);
+         }
+     }   /* while (ip < ilimit) */
+@@ -1320,21 +1437,27 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+     /* Return the last literals size */
+     return (size_t)(iend - anchor);
+ }
++#endif /* build exclusions */
+ 
++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
+ static size_t ZSTD_compressBlock_opt0(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
+ {
+     return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode);
+ }
++#endif
+ 
++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
+ static size_t ZSTD_compressBlock_opt2(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
+ {
+     return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode);
+ }
++#endif
+ 
++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
+ size_t ZSTD_compressBlock_btopt(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         const void* src, size_t srcSize)
+@@ -1342,20 +1465,23 @@ size_t ZSTD_compressBlock_btopt(
+     DEBUGLOG(5, "ZSTD_compressBlock_btopt");
+     return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
+ }
++#endif
+ 
+ 
+ 
+ 
++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
  /* ZSTD_initStats_ultra():
   * make a first compression pass, just to seed stats with more accurate starting values.
   * only works on first block, with no dictionary and no ldm.
 - * this function cannot error, hence its contract must be respected.
 + * this function cannot error out, its narrow contract must be respected.
   */
- static void
- ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
-@@ -1368,7 +1395,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
+-static void
+-ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
+-                     seqStore_t* seqStore,
+-                     U32 rep[ZSTD_REP_NUM],
+-               const void* src, size_t srcSize)
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++void ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
++                          seqStore_t* seqStore,
++                          U32 rep[ZSTD_REP_NUM],
++                    const void* src, size_t srcSize)
+ {
+     U32 tmpRep[ZSTD_REP_NUM];  /* updated rep codes will sink here */
+     ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep));
+@@ -1368,7 +1494,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
  
      ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict);   /* generate stats into ms->opt*/
  
@@ -20497,7 +28772,7 @@ index fd82acfda62f..1e41cb04f482 100644
      ZSTD_resetSeqStore(seqStore);
      ms->window.base -= srcSize;
      ms->window.dictLimit += (U32)srcSize;
-@@ -1392,20 +1419,20 @@ size_t ZSTD_compressBlock_btultra2(
+@@ -1392,10 +1518,10 @@ size_t ZSTD_compressBlock_btultra2(
      U32 const curr = (U32)((const BYTE*)src - ms->window.base);
      DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize);
  
@@ -20511,9 +28786,7 @@ index fd82acfda62f..1e41cb04f482 100644
       * Consequently, this can only work if no data has been previously loaded in tables,
       * aka, no dictionary, no prefix, no ldm preprocessing.
       * The compression ratio gain is generally small (~0.5% on first block),
--     * the cost is 2x cpu time on first block. */
-+    ** the cost is 2x cpu time on first block. */
-     assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+@@ -1404,15 +1530,17 @@ size_t ZSTD_compressBlock_btultra2(
      if ( (ms->opt.litLengthSum==0)   /* first block */
        && (seqStore->sequences == seqStore->sequencesStart)  /* no ldm */
        && (ms->window.dictLimit == ms->window.lowLimit)   /* no dictionary */
@@ -20524,8 +28797,50 @@ index fd82acfda62f..1e41cb04f482 100644
        ) {
          ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
      }
+ 
+     return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
+ }
++#endif
+ 
++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
+ size_t ZSTD_compressBlock_btopt_dictMatchState(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         const void* src, size_t srcSize)
+@@ -1420,18 +1548,20 @@ size_t ZSTD_compressBlock_btopt_dictMatchState(
+     return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
+ }
+ 
+-size_t ZSTD_compressBlock_btultra_dictMatchState(
++size_t ZSTD_compressBlock_btopt_extDict(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         const void* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
++    return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
+ }
++#endif
+ 
+-size_t ZSTD_compressBlock_btopt_extDict(
++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
++size_t ZSTD_compressBlock_btultra_dictMatchState(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         const void* src, size_t srcSize)
+ {
+-    return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
++    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
+ }
+ 
+ size_t ZSTD_compressBlock_btultra_extDict(
+@@ -1440,6 +1570,7 @@ size_t ZSTD_compressBlock_btultra_extDict(
+ {
+     return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
+ }
++#endif
+ 
+ /* note : no btultra2 variant for extDict nor dictMatchState,
+  * because btultra2 is not meant to work with dictionaries
 diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h
-index 22b862858ba7..faa73ff4b03d 100644
+index 22b862858ba7..ac1b743d27cd 100644
 --- a/lib/zstd/compress/zstd_opt.h
 +++ b/lib/zstd/compress/zstd_opt.h
 @@ -1,5 +1,6 @@
@@ -20536,8 +28851,77 @@ index 22b862858ba7..faa73ff4b03d 100644
   * All rights reserved.
   *
   * This source code is licensed under both the BSD-style license (found in the
+@@ -14,30 +15,40 @@
+ 
+ #include "zstd_compress_internal.h"
+ 
++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
+ /* used in ZSTD_loadDictionaryContent() */
+ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend);
++#endif
+ 
++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
+ size_t ZSTD_compressBlock_btopt(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_btultra(
++size_t ZSTD_compressBlock_btopt_dictMatchState(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-size_t ZSTD_compressBlock_btultra2(
++size_t ZSTD_compressBlock_btopt_extDict(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+ 
++#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt
++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState
++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict
++#else
++#define ZSTD_COMPRESSBLOCK_BTOPT NULL
++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL
++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL
++#endif
+ 
+-size_t ZSTD_compressBlock_btopt_dictMatchState(
++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
++size_t ZSTD_compressBlock_btultra(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+ size_t ZSTD_compressBlock_btultra_dictMatchState(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+-
+-size_t ZSTD_compressBlock_btopt_extDict(
+-        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+-        void const* src, size_t srcSize);
+ size_t ZSTD_compressBlock_btultra_extDict(
+         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+         void const* src, size_t srcSize);
+@@ -45,6 +56,20 @@ size_t ZSTD_compressBlock_btultra_extDict(
+         /* note : no btultra2 variant for extDict nor dictMatchState,
+          * because btultra2 is not meant to work with dictionaries
+          * and is only specific for the first block (no prefix) */
++size_t ZSTD_compressBlock_btultra2(
++        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
++        void const* src, size_t srcSize);
++
++#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra
++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState
++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict
++#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2
++#else
++#define ZSTD_COMPRESSBLOCK_BTULTRA NULL
++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL
++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL
++#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL
++#endif
+ 
+ 
+ #endif /* ZSTD_OPT_H */
 diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c
-index 60958afebc41..db670d71fdab 100644
+index 60958afebc41..ac8b87f48f84 100644
 --- a/lib/zstd/decompress/huf_decompress.c
 +++ b/lib/zstd/decompress/huf_decompress.c
 @@ -1,7 +1,8 @@
@@ -20647,7 +29031,7 @@ index 60958afebc41..db670d71fdab 100644
          return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
      }
  
-@@ -134,15 +144,28 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
+@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
      return dtd;
  }
  
@@ -20672,16 +29056,19 @@ index 60958afebc41..db670d71fdab 100644
 + * op [in/out] - The output pointers, must be updated to reflect what is written.
 + * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
 + * dt [in] - The decoding table.
-+ * ilimit [in] - The input limit, stop when any input pointer is below ilimit.
++ * ilowest [in] - The beginning of the valid range of the input. Decoders may read
++ *                down to this pointer. It may be below iend[0].
 + * oend [in] - The end of the output stream. op[3] must not cross oend.
 + * iend [in] - The end of each input stream. ip[i] may cross iend[i],
-+ *             as long as it is above ilimit, but that indicates corruption.
++ *             as long as it is above ilowest, but that indicates corruption.
 + */
  typedef struct {
      BYTE const* ip[4];
      BYTE* op[4];
-@@ -151,15 +174,17 @@ typedef struct {
-     BYTE const* ilimit;
+     U64 bits[4];
+     void const* dt;
+-    BYTE const* ilimit;
++    BYTE const* ilowest;
      BYTE* oend;
      BYTE const* iend[4];
 -} HUF_DecompressAsmArgs;
@@ -20703,9 +29090,12 @@ index 60958afebc41..db670d71fdab 100644
  {
      void const* dt = DTable + 1;
      U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
-@@ -168,9 +193,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
  
-     BYTE* const oend = (BYTE*)dst + dstSize;
+-    const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
++    const BYTE* const istart = (const BYTE*)src;
+ 
+-    BYTE* const oend = (BYTE*)dst + dstSize;
++    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
  
 -    /* The following condition is false on x32 platform,
 -     * but HUF_asm is not compatible with this ABI */
@@ -20715,10 +29105,15 @@ index 60958afebc41..db670d71fdab 100644
 +     */
 +    if (!MEM_isLittleEndian() || MEM_32bits())
 +        return 0;
++
++    /* Avoid nullptr addition */
++    if (dstSize == 0)
++        return 0;
++    assert(dst != NULL);
  
      /* strict minimum : jump table + 1 byte per stream */
      if (srcSize < 10)
-@@ -181,7 +208,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
+@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
       * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
       */
      if (dtLog != HUF_DECODER_FAST_TABLELOG)
@@ -20727,23 +29122,28 @@ index 60958afebc41..db670d71fdab 100644
  
      /* Read the jump table. */
      {
-@@ -195,13 +222,13 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
+-        const BYTE* const istart = (const BYTE*)src;
+         size_t const length1 = MEM_readLE16(istart);
+         size_t const length2 = MEM_readLE16(istart+2);
+         size_t const length3 = MEM_readLE16(istart+4);
+@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
          args->iend[2] = args->iend[1] + length2;
          args->iend[3] = args->iend[2] + length3;
  
 -        /* HUF_initDStream() requires this, and this small of an input
 +        /* HUF_initFastDStream() requires this, and this small of an input
           * won't benefit from the ASM loop anyways.
-          * length1 must be >= 16 so that ip[0] >= ilimit before the loop
-          * starts.
+-         * length1 must be >= 16 so that ip[0] >= ilimit before the loop
+-         * starts.
           */
-         if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
+-        if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
 -            return 1;
++        if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8)
 +            return 0;
          if (length4 > srcSize) return ERROR(corruption_detected);   /* overflow */
      }
      /* ip[] contains the position that is currently loaded into bits[]. */
-@@ -218,7 +245,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
+@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
  
      /* No point to call the ASM loop for tiny outputs. */
      if (args->op[3] >= oend)
@@ -20752,7 +29152,7 @@ index 60958afebc41..db670d71fdab 100644
  
      /* bits[] is the bit container.
          * It is read from the MSB down to the LSB.
-@@ -227,10 +254,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
+@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
          * set, so that CountTrailingZeros(bits[]) can be used
          * to count how many bits we've consumed.
          */
@@ -20760,14 +29160,24 @@ index 60958afebc41..db670d71fdab 100644
 -    args->bits[1] = HUF_initDStream(args->ip[1]);
 -    args->bits[2] = HUF_initDStream(args->ip[2]);
 -    args->bits[3] = HUF_initDStream(args->ip[3]);
+-
+-    /* If ip[] >= ilimit, it is guaranteed to be safe to
+-        * reload bits[]. It may be beyond its section, but is
+-        * guaranteed to be valid (>= istart).
+-        */
+-    args->ilimit = ilimit;
 +    args->bits[0] = HUF_initFastDStream(args->ip[0]);
 +    args->bits[1] = HUF_initFastDStream(args->ip[1]);
 +    args->bits[2] = HUF_initFastDStream(args->ip[2]);
 +    args->bits[3] = HUF_initFastDStream(args->ip[3]);
++
++    /* The decoders must be sure to never read beyond ilowest.
++     * This is lower than iend[0], but allowing decoders to read
++     * down to ilowest can allow an extra iteration or two in the
++     * fast loop.
++     */
++    args->ilowest = istart;
  
-     /* If ip[] >= ilimit, it is guaranteed to be safe to
-         * reload bits[]. It may be beyond its section, but is
-@@ -241,10 +268,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
      args->oend = oend;
      args->dt = dt;
  
@@ -20780,16 +29190,17 @@ index 60958afebc41..db670d71fdab 100644
  {
      /* Validate that we haven't overwritten. */
      if (args->op[stream] > segmentEnd)
-@@ -258,15 +285,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs
+@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs
          return ERROR(corruption_detected);
  
      /* Construct the BIT_DStream_t. */
 -    bit->bitContainer = MEM_readLE64(args->ip[stream]);
 -    bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
+-    bit->start = (const char*)args->iend[0];
 +    assert(sizeof(size_t) == 8);
 +    bit->bitContainer = MEM_readLEST(args->ip[stream]);
 +    bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
-     bit->start = (const char*)args->iend[0];
++    bit->start = (const char*)args->ilowest;
      bit->limitPtr = bit->start + sizeof(size_t);
      bit->ptr = (const char*)args->ip[stream];
  
@@ -20799,25 +29210,25 @@ index 60958afebc41..db670d71fdab 100644
 +
 +/* Calls X(N) for each stream 0, 1, 2, 3. */
 +#define HUF_4X_FOR_EACH_STREAM(X) \
-+    {                             \
-+        X(0)                      \
-+        X(1)                      \
-+        X(2)                      \
-+        X(3)                      \
-+    }
++    do {                          \
++        X(0);                     \
++        X(1);                     \
++        X(2);                     \
++        X(3);                     \
++    } while (0)
 +
 +/* Calls X(N, var) for each stream 0, 1, 2, 3. */
 +#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \
-+    {                                           \
-+        X(0, (var))                             \
-+        X(1, (var))                             \
-+        X(2, (var))                             \
-+        X(3, (var))                             \
-+    }
++    do {                                        \
++        X(0, (var));                            \
++        X(1, (var));                            \
++        X(2, (var));                            \
++        X(3, (var));                            \
++    } while (0)
  
  
  #ifndef HUF_FORCE_DECOMPRESS_X2
-@@ -283,10 +328,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1;   /* single-symbol decodi
+@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1;   /* single-symbol decodi
  static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
      U64 D4;
      if (MEM_isLittleEndian()) {
@@ -20831,7 +29242,7 @@ index 60958afebc41..db670d71fdab 100644
      D4 *= 0x0001000100010001ULL;
      return D4;
  }
-@@ -329,13 +375,7 @@ typedef struct {
+@@ -329,13 +379,7 @@ typedef struct {
          BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
  } HUF_ReadDTableX1_Workspace;
  
@@ -20846,7 +29257,7 @@ index 60958afebc41..db670d71fdab 100644
  {
      U32 tableLog = 0;
      U32 nbSymbols = 0;
-@@ -350,7 +390,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
+@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
      DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
      /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */   /* is not necessary, even though some analyzer complain ... */
  
@@ -20855,7 +29266,7 @@ index 60958afebc41..db670d71fdab 100644
      if (HUF_isError(iSize)) return iSize;
  
  
-@@ -377,9 +417,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
+@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
       * rankStart[0] is not filled because there are no entries in the table for
       * weight 0.
       */
@@ -20867,7 +29278,7 @@ index 60958afebc41..db670d71fdab 100644
          int const unroll = 4;
          int const nLimit = (int)nbSymbols - unroll + 1;
          for (n=0; n<(int)tableLog+1; n++) {
-@@ -406,10 +445,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
+@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
       * We can switch based on the length to a different inner loop which is
       * optimized for that particular case.
       */
@@ -20881,7 +29292,34 @@ index 60958afebc41..db670d71fdab 100644
          for (w=1; w<tableLog+1; ++w) {
              int const symbolCount = wksp->rankVal[w];
              int const length = (1 << w) >> 1;
-@@ -519,7 +557,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
+@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog
+ }
+ 
+ #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
+-    *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
++    do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0)
+ 
+-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)  \
+-    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+-        HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
++#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)      \
++    do {                                            \
++        if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
++            HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
++    } while (0)
+ 
+-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
+-    if (MEM_64bits()) \
+-        HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
++#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr)      \
++    do {                                            \
++        if (MEM_64bits())                           \
++            HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
++    } while (0)
+ 
+ HINT_INLINE size_t
+ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
+@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
      while (p < pEnd)
          HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
  
@@ -20890,7 +29328,16 @@ index 60958afebc41..db670d71fdab 100644
  }
  
  FORCE_INLINE_TEMPLATE size_t
-@@ -545,6 +583,10 @@ HUF_decompress1X1_usingDTable_internal_body(
+@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_body(
+     const HUF_DTable* DTable)
+ {
+     BYTE* op = (BYTE*)dst;
+-    BYTE* const oend = op + dstSize;
++    BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize);
+     const void* dtPtr = DTable + 1;
+     const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
+     BIT_DStream_t bitD;
+@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_body(
      return dstSize;
  }
  
@@ -20901,15 +29348,23 @@ index 60958afebc41..db670d71fdab 100644
  FORCE_INLINE_TEMPLATE size_t
  HUF_decompress4X1_usingDTable_internal_body(
            void* dst,  size_t dstSize,
-@@ -588,6 +630,7 @@ HUF_decompress4X1_usingDTable_internal_body(
+@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_body(
+ {
+     /* Check */
+     if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
++    if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
+ 
+     {   const BYTE* const istart = (const BYTE*) cSrc;
+         BYTE* const ostart = (BYTE*) dst;
+@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_body(
  
          if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
          if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
-+        if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
++        assert(dstSize >= 6); /* validated above */
          CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
          CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
          CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
-@@ -650,38 +693,156 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
+@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
  }
  #endif
  
@@ -20936,7 +29391,7 @@ index 60958afebc41..db670d71fdab 100644
 +    BYTE* op[4];
 +    U16 const* const dtable = (U16 const*)args->dt;
 +    BYTE* const oend = args->oend;
-+    BYTE const* const ilimit = args->ilimit;
++    BYTE const* const ilowest = args->ilowest;
 +
 +    /* Copy the arguments to local variables */
 +    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
@@ -20954,7 +29409,7 @@ index 60958afebc41..db670d71fdab 100644
 +#ifndef NDEBUG
 +        for (stream = 0; stream < 4; ++stream) {
 +            assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
-+            assert(ip[stream] >= ilimit);
++            assert(ip[stream] >= ilowest);
 +        }
 +#endif
 +        /* Compute olimit */
@@ -20964,7 +29419,7 @@ index 60958afebc41..db670d71fdab 100644
 +            /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
 +             * per stream.
 +             */
-+            size_t const iiters = (size_t)(ip[0] - ilimit) / 7;
++            size_t const iiters = (size_t)(ip[0] - ilowest) / 7;
 +            /* We can safely run iters iterations before running bounds checks */
 +            size_t const iters = MIN(oiters, iiters);
 +            size_t const symbols = iters * 5;
@@ -20975,8 +29430,8 @@ index 60958afebc41..db670d71fdab 100644
 +             */
 +            olimit = op[3] + symbols;
 +
-+            /* Exit fast decoding loop once we get close to the end. */
-+            if (op[3] + 20 > olimit)
++            /* Exit fast decoding loop once we reach the end. */
++            if (op[3] == olimit)
 +                break;
 +
 +            /* Exit the decoding loop if any input pointer has crossed the
@@ -20996,15 +29451,15 @@ index 60958afebc41..db670d71fdab 100644
 +#endif
 +
 +#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol)                 \
-+    {                                                           \
++    do {                                                        \
 +        int const index = (int)(bits[(_stream)] >> 53);         \
 +        int const entry = (int)dtable[index];                   \
 +        bits[(_stream)] <<= (entry & 0x3F);                     \
 +        op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
-+    }
++    } while (0)
 +
 +#define HUF_4X1_RELOAD_STREAM(_stream)                              \
-+    {                                                               \
++    do {                                                            \
 +        int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
 +        int const nbBits = ctz & 7;                                 \
 +        int const nbBytes = ctz >> 3;                               \
@@ -21012,30 +29467,30 @@ index 60958afebc41..db670d71fdab 100644
 +        ip[(_stream)] -= nbBytes;                                   \
 +        bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;            \
 +        bits[(_stream)] <<= nbBits;                                 \
-+    }
++    } while (0)
 +
 +        /* Manually unroll the loop because compilers don't consistently
 +         * unroll the inner loops, which destroys performance.
 +         */
 +        do {
 +            /* Decode 5 symbols in each of the 4 streams */
-+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0)
-+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1)
-+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2)
-+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3)
-+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4)
++            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0);
++            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1);
++            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2);
++            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3);
++            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
 +
 +            /* Reload each of the 4 the bitstreams */
-+            HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM)
++            HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
 +        } while (op[3] < olimit);
 +
 +#undef HUF_4X1_DECODE_SYMBOL
 +#undef HUF_4X1_RELOAD_STREAM
 +    }
-+
-+_out:
  
 -static HUF_ASM_X86_64_BMI2_ATTRS
++_out:
++
 +    /* Save the final values of each of the state variables back to args. */
 +    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
 +    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
@@ -21058,14 +29513,16 @@ index 60958afebc41..db670d71fdab 100644
 +    HUF_DecompressFastLoopFn loopFn)
  {
      void const* dt = DTable + 1;
-     const BYTE* const iend = (const BYTE*)cSrc + 6;
-     BYTE* const oend = (BYTE*)dst + dstSize;
+-    const BYTE* const iend = (const BYTE*)cSrc + 6;
+-    BYTE* const oend = (BYTE*)dst + dstSize;
 -    HUF_DecompressAsmArgs args;
 -    {
 -        size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
 -        FORWARD_IF_ERROR(ret, "Failed to init asm args");
 -        if (ret != 0)
 -            return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
++    BYTE const* const ilowest = (BYTE const*)cSrc;
++    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
 +    HUF_DecompressFastArgs args;
 +    {   size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
 +        FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
@@ -21073,14 +29530,30 @@ index 60958afebc41..db670d71fdab 100644
 +            return 0;
      }
  
-     assert(args.ip[0] >= args.ilimit);
+-    assert(args.ip[0] >= args.ilimit);
 -    HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
++    assert(args.ip[0] >= args.ilowest);
 +    loopFn(&args);
  
-     /* Our loop guarantees that ip[] >= ilimit and that we haven't
+-    /* Our loop guarantees that ip[] >= ilimit and that we haven't
++    /* Our loop guarantees that ip[] >= ilowest and that we haven't
      * overwritten any op[].
-@@ -694,8 +855,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
-     (void)iend;
+     */
+-    assert(args.ip[0] >= iend);
+-    assert(args.ip[1] >= iend);
+-    assert(args.ip[2] >= iend);
+-    assert(args.ip[3] >= iend);
++    assert(args.ip[0] >= ilowest);
++    assert(args.ip[0] >= ilowest);
++    assert(args.ip[1] >= ilowest);
++    assert(args.ip[2] >= ilowest);
++    assert(args.ip[3] >= ilowest);
+     assert(args.op[3] <= oend);
+-    (void)iend;
++
++    assert(ilowest == args.ilowest);
++    assert(ilowest + 6 == args.iend[0]);
++    (void)ilowest;
  
      /* finish bit streams one by one. */
 -    {
@@ -21089,7 +29562,7 @@ index 60958afebc41..db670d71fdab 100644
          BYTE* segmentEnd = (BYTE*)dst;
          int i;
          for (i = 0; i < 4; ++i) {
-@@ -712,97 +872,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
+@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
      }
  
      /* decoded size */
@@ -21151,13 +29624,13 @@ index 60958afebc41..db670d71fdab 100644
 -    if (dtd.tableType != 0) return ERROR(GENERIC);
 -    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
 -}
--
+ 
 -size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
 -                                   const void* cSrc, size_t cSrcSize,
 -                                   void* workSpace, size_t wkspSize)
 -{
 -    const BYTE* ip = (const BYTE*) cSrc;
- 
+-
 -    size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
 -    if (HUF_isError(hSize)) return hSize;
 -    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
@@ -21212,7 +29685,7 @@ index 60958afebc41..db670d71fdab 100644
  #endif /* HUF_FORCE_DECOMPRESS_X2 */
  
  
-@@ -985,7 +1107,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32
+@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32
  
  static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
                             const sortedSymbol_t* sortedList,
@@ -21221,7 +29694,7 @@ index 60958afebc41..db670d71fdab 100644
                             const U32 nbBitsBaseline)
  {
      U32* const rankVal = rankValOrigin[0];
-@@ -1040,14 +1162,7 @@ typedef struct {
+@@ -1040,14 +1175,7 @@ typedef struct {
  
  size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
                         const void* src, size_t srcSize,
@@ -21237,7 +29710,7 @@ index 60958afebc41..db670d71fdab 100644
  {
      U32 tableLog, maxW, nbSymbols;
      DTableDesc dtd = HUF_getDTableDesc(DTable);
-@@ -1069,7 +1184,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
+@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
      if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
      /* ZSTD_memset(weightList, 0, sizeof(weightList)); */  /* is not necessary, even though some analyzer complain ... */
  
@@ -21246,7 +29719,43 @@ index 60958afebc41..db670d71fdab 100644
      if (HUF_isError(iSize)) return iSize;
  
      /* check result */
-@@ -1240,6 +1355,11 @@ HUF_decompress1X2_usingDTable_internal_body(
+@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c
+ }
+ 
+ #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+-    ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
++    do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0)
+ 
+-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+-    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+-        ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
++#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr)                     \
++    do {                                                           \
++        if (MEM_64bits() || (HUF_TABLELOG_MAX<=12))                \
++            ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
++    } while (0)
+ 
+-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+-    if (MEM_64bits()) \
+-        ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
++#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr)                     \
++    do {                                                           \
++        if (MEM_64bits())                                          \
++            ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
++    } while (0)
+ 
+ HINT_INLINE size_t
+ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
+@@ -1227,7 +1359,7 @@ HUF_decompress1X2_usingDTable_internal_body(
+ 
+     /* decode */
+     {   BYTE* const ostart = (BYTE*) dst;
+-        BYTE* const oend = ostart + dstSize;
++        BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize);
+         const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
+         const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+         DTableDesc const dtd = HUF_getDTableDesc(DTable);
+@@ -1240,6 +1372,11 @@ HUF_decompress1X2_usingDTable_internal_body(
      /* decoded size */
      return dstSize;
  }
@@ -21258,7 +29767,15 @@ index 60958afebc41..db670d71fdab 100644
  FORCE_INLINE_TEMPLATE size_t
  HUF_decompress4X2_usingDTable_internal_body(
            void* dst,  size_t dstSize,
-@@ -1280,8 +1400,9 @@ HUF_decompress4X2_usingDTable_internal_body(
+@@ -1247,6 +1384,7 @@ HUF_decompress4X2_usingDTable_internal_body(
+     const HUF_DTable* DTable)
+ {
+     if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
++    if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
+ 
+     {   const BYTE* const istart = (const BYTE*) cSrc;
+         BYTE* const ostart = (BYTE*) dst;
+@@ -1280,8 +1418,9 @@ HUF_decompress4X2_usingDTable_internal_body(
          DTableDesc const dtd = HUF_getDTableDesc(DTable);
          U32 const dtLog = dtd.tableLog;
  
@@ -21266,11 +29783,11 @@ index 60958afebc41..db670d71fdab 100644
 -        if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
 +        if (length4 > cSrcSize) return ERROR(corruption_detected);  /* overflow */
 +        if (opStart4 > oend) return ERROR(corruption_detected);     /* overflow */
-+        if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
++        assert(dstSize >= 6 /* validated above */);
          CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
          CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
          CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
-@@ -1366,36 +1487,178 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
+@@ -1366,44 +1505,191 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
  }
  #endif
  
@@ -21297,7 +29814,7 @@ index 60958afebc41..db670d71fdab 100644
 +    BYTE* op[4];
 +    BYTE* oend[4];
 +    HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
-+    BYTE const* const ilimit = args->ilimit;
++    BYTE const* const ilowest = args->ilowest;
 +
 +    /* Copy the arguments to local registers. */
 +    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
@@ -21320,7 +29837,7 @@ index 60958afebc41..db670d71fdab 100644
 +#ifndef NDEBUG
 +        for (stream = 0; stream < 4; ++stream) {
 +            assert(op[stream] <= oend[stream]);
-+            assert(ip[stream] >= ilimit);
++            assert(ip[stream] >= ilowest);
 +        }
 +#endif
 +        /* Compute olimit */
@@ -21333,7 +29850,7 @@ index 60958afebc41..db670d71fdab 100644
 +             * We also know that each input pointer is >= ip[0]. So we can run
 +             * iters loops before running out of input.
 +             */
-+            size_t iters = (size_t)(ip[0] - ilimit) / 7;
++            size_t iters = (size_t)(ip[0] - ilowest) / 7;
 +            /* Each iteration can produce up to 10 bytes of output per stream.
 +             * Each output stream my advance at different rates. So take the
 +             * minimum number of safe iterations among all the output streams.
@@ -21351,8 +29868,8 @@ index 60958afebc41..db670d71fdab 100644
 +             */
 +            olimit = op[3] + (iters * 5);
 +
-+            /* Exit the fast decoding loop if we are too close to the end. */
-+            if (op[3] + 10 > olimit)
++            /* Exit the fast decoding loop once we reach the end. */
++            if (op[3] == olimit)
 +                break;
 +
 +            /* Exit the decoding loop if any input pointer has crossed the
@@ -21370,19 +29887,23 @@ index 60958afebc41..db670d71fdab 100644
 +            assert(ip[stream] >= ip[stream - 1]);
 +        }
 +#endif
-+
-+#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3)        \
-+    if ((_decode3) || (_stream) != 3) {                 \
-+        int const index = (int)(bits[(_stream)] >> 53); \
-+        HUF_DEltX2 const entry = dtable[index];         \
-+        MEM_write16(op[(_stream)], entry.sequence);     \
-+        bits[(_stream)] <<= (entry.nbBits) & 0x3F;      \
-+        op[(_stream)] += (entry.length);                \
-+    }
+ 
+-static HUF_ASM_X86_64_BMI2_ATTRS size_t
+-HUF_decompress4X2_usingDTable_internal_bmi2_asm(
++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3)                      \
++    do {                                                              \
++        if ((_decode3) || (_stream) != 3) {                           \
++            int const index = (int)(bits[(_stream)] >> 53);           \
++            HUF_DEltX2 const entry = dtable[index];                   \
++            MEM_write16(op[(_stream)], entry.sequence);               \
++            bits[(_stream)] <<= (entry.nbBits) & 0x3F;                \
++            op[(_stream)] += (entry.length);                          \
++        }                                                             \
++    } while (0)
 +
 +#define HUF_4X2_RELOAD_STREAM(_stream)                                  \
-+    {                                                                   \
-+        HUF_4X2_DECODE_SYMBOL(3, 1)                                     \
++    do {                                                                \
++        HUF_4X2_DECODE_SYMBOL(3, 1);                                    \
 +        {                                                               \
 +            int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
 +            int const nbBits = ctz & 7;                                 \
@@ -21391,7 +29912,7 @@ index 60958afebc41..db670d71fdab 100644
 +            bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;            \
 +            bits[(_stream)] <<= nbBits;                                 \
 +        }                                                               \
-+    }
++    } while (0)
 +
 +        /* Manually unroll the loop because compilers don't consistently
 +         * unroll the inner loops, which destroys performance.
@@ -21401,25 +29922,23 @@ index 60958afebc41..db670d71fdab 100644
 +             * The final stream will be decoded during the reload phase
 +             * to reduce register pressure.
 +             */
-+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0)
-+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0)
-+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0)
-+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0)
-+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0)
++            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
++            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
++            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
++            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
++            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
 +
 +            /* Decode one symbol from the final stream */
-+            HUF_4X2_DECODE_SYMBOL(3, 1)
++            HUF_4X2_DECODE_SYMBOL(3, 1);
 +
 +            /* Decode 4 symbols from the final stream & reload bitstreams.
 +             * The final stream is reloaded last, meaning that all 5 symbols
 +             * are decoded from the final stream before it is reloaded.
 +             */
-+            HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM)
++            HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
 +        } while (op[3] < olimit);
 +    }
- 
--static HUF_ASM_X86_64_BMI2_ATTRS size_t
--HUF_decompress4X2_usingDTable_internal_bmi2_asm(
++
 +#undef HUF_4X2_DECODE_SYMBOL
 +#undef HUF_4X2_RELOAD_STREAM
 +
@@ -21440,9 +29959,11 @@ index 60958afebc41..db670d71fdab 100644
 +    const HUF_DTable* DTable,
 +    HUF_DecompressFastLoopFn loopFn) {
      void const* dt = DTable + 1;
-     const BYTE* const iend = (const BYTE*)cSrc + 6;
-     BYTE* const oend = (BYTE*)dst + dstSize;
+-    const BYTE* const iend = (const BYTE*)cSrc + 6;
+-    BYTE* const oend = (BYTE*)dst + dstSize;
 -    HUF_DecompressAsmArgs args;
++    const BYTE* const ilowest = (const BYTE*)cSrc;
++    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
 +    HUF_DecompressFastArgs args;
      {
 -        size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
@@ -21454,13 +29975,30 @@ index 60958afebc41..db670d71fdab 100644
 +            return 0;
      }
  
-     assert(args.ip[0] >= args.ilimit);
+-    assert(args.ip[0] >= args.ilimit);
 -    HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args);
++    assert(args.ip[0] >= args.ilowest);
 +    loopFn(&args);
  
      /* note : op4 already verified within main loop */
-     assert(args.ip[0] >= iend);
-@@ -1426,91 +1689,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm(
+-    assert(args.ip[0] >= iend);
+-    assert(args.ip[1] >= iend);
+-    assert(args.ip[2] >= iend);
+-    assert(args.ip[3] >= iend);
++    assert(args.ip[0] >= ilowest);
++    assert(args.ip[1] >= ilowest);
++    assert(args.ip[2] >= ilowest);
++    assert(args.ip[3] >= ilowest);
+     assert(args.op[3] <= oend);
+-    (void)iend;
++
++    assert(ilowest == args.ilowest);
++    assert(ilowest + 6 == args.iend[0]);
++    (void)ilowest;
+ 
+     /* finish bitStreams one by one */
+     {
+@@ -1426,91 +1712,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm(
      /* decoded size */
      return dstSize;
  }
@@ -21580,7 +30118,7 @@ index 60958afebc41..db670d71fdab 100644
  #endif /* HUF_FORCE_DECOMPRESS_X1 */
  
  
-@@ -1518,44 +1762,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+@@ -1518,44 +1785,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
  /* Universal decompression selectors */
  /* ***********************************/
  
@@ -21625,7 +30163,7 @@ index 60958afebc41..db670d71fdab 100644
  
  #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
  typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
-@@ -1610,36 +1816,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
+@@ -1610,36 +1839,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
  #endif
  }
  
@@ -21663,7 +30201,7 @@ index 60958afebc41..db670d71fdab 100644
  {
      /* validation checks */
      if (dstSize == 0) return ERROR(dstSize_tooSmall);
-@@ -1652,71 +1831,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+@@ -1652,71 +1854,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
          (void)algoNb;
          assert(algoNb == 0);
          return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
@@ -21753,7 +30291,7 @@ index 60958afebc41..db670d71fdab 100644
  {
      /* validation checks */
      if (dstSize == 0) return ERROR(dstSize_tooSmall);
-@@ -1726,15 +1905,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
+@@ -1726,15 +1928,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
  #if defined(HUF_FORCE_DECOMPRESS_X1)
          (void)algoNb;
          assert(algoNb == 0);
@@ -21828,7 +30366,7 @@ index 8c1a79d666f8..de459a0dacd1 100644
   *
   * This source code is licensed under both the BSD-style license (found in the
 diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c
-index 6b3177c94711..4b3b88715f18 100644
+index 6b3177c94711..c9cbc45f6ed9 100644
 --- a/lib/zstd/decompress/zstd_decompress.c
 +++ b/lib/zstd/decompress/zstd_decompress.c
 @@ -1,5 +1,6 @@
@@ -21839,27 +30377,25 @@ index 6b3177c94711..4b3b88715f18 100644
   * All rights reserved.
   *
   * This source code is licensed under both the BSD-style license (found in the
-@@ -52,17 +53,18 @@
- /*-*******************************************************
+@@ -53,13 +54,15 @@
  *  Dependencies
  *********************************************************/
-+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
  #include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
++#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
++#include "../common/error_private.h"
++#include "../common/zstd_internal.h"  /* blockProperties_t */
  #include "../common/mem.h"         /* low level memory routines */
++#include "../common/bits.h"  /* ZSTD_highbit32 */
  #define FSE_STATIC_LINKING_ONLY
  #include "../common/fse.h"
 -#define HUF_STATIC_LINKING_ONLY
  #include "../common/huf.h"
  #include <linux/xxhash.h> /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */
- #include "../common/zstd_internal.h"  /* blockProperties_t */
+-#include "../common/zstd_internal.h"  /* blockProperties_t */
  #include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
  #include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
  #include "zstd_decompress_block.h"   /* ZSTD_decompressBlock_internal */
-+#include "../common/bits.h"  /* ZSTD_highbit32 */
- 
- 
- 
-@@ -72,11 +74,11 @@
+@@ -72,11 +75,11 @@
   *************************************/
  
  #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4
@@ -21876,15 +30412,24 @@ index 6b3177c94711..4b3b88715f18 100644
  
  #define DDICT_HASHSET_TABLE_BASE_SIZE 64
  #define DDICT_HASHSET_RESIZE_FACTOR 2
-@@ -237,6 +239,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx)
+@@ -237,6 +240,8 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx)
      dctx->outBufferMode = ZSTD_bm_buffered;
      dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum;
      dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict;
 +    dctx->disableHufAsm = 0;
++    dctx->maxBlockSizeParam = 0;
  }
  
  static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
-@@ -421,16 +424,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
+@@ -253,6 +258,7 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
+     dctx->streamStage = zdss_init;
+     dctx->noForwardProgress = 0;
+     dctx->oversizedDuration = 0;
++    dctx->isFrameDecompression = 1;
+ #if DYNAMIC_BMI2
+     dctx->bmi2 = ZSTD_cpuSupportsBmi2();
+ #endif
+@@ -421,16 +427,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
   *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
   * @return : 0, `zfhPtr` is correctly filled,
   *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
@@ -21929,7 +30474,7 @@ index 6b3177c94711..4b3b88715f18 100644
      if ( (format != ZSTD_f_zstd1_magicless)
        && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
          if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
-@@ -540,61 +567,62 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize)
+@@ -540,61 +570,62 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize)
      sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE);
      RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32,
                      frameParameter_unsupported, "");
@@ -22018,7 +30563,7 @@ index 6b3177c94711..4b3b88715f18 100644
              assert(skippableSize <= srcSize);
  
              src = (const BYTE *)src + skippableSize;
-@@ -602,17 +630,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
+@@ -602,17 +633,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
              continue;
          }
  
@@ -22044,7 +30589,32 @@ index 6b3177c94711..4b3b88715f18 100644
  
              src = (const BYTE *)src + frameSrcSize;
              srcSize -= frameSrcSize;
-@@ -730,10 +758,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize
+@@ -676,13 +707,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret)
+     return frameSizeInfo;
+ }
+ 
+-static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize)
++static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format)
+ {
+     ZSTD_frameSizeInfo frameSizeInfo;
+     ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo));
+ 
+ 
+-    if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE)
++    if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE)
+         && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+         frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize);
+         assert(ZSTD_isError(frameSizeInfo.compressedSize) ||
+@@ -696,7 +727,7 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize
+         ZSTD_frameHeader zfh;
+ 
+         /* Extract Frame Header */
+-        {   size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize);
++        {   size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format);
+             if (ZSTD_isError(ret))
+                 return ZSTD_errorFrameSizeInfo(ret);
+             if (ret > 0)
+@@ -730,23 +761,26 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize
              ip += 4;
          }
  
@@ -22057,7 +30627,37 @@ index 6b3177c94711..4b3b88715f18 100644
          return frameSizeInfo;
      }
  }
-@@ -773,6 +802,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
+ 
++static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) {
++    ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format);
++    return frameSizeInfo.compressedSize;
++}
++
+ /* ZSTD_findFrameCompressedSize() :
+- *  compatible with legacy mode
+- *  `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame
+- *  `srcSize` must be at least as large as the frame contained
+- *  @return : the compressed size of the frame starting at `src` */
++ * See docs in zstd.h
++ * Note: compatible with legacy mode */
+ size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
+ {
+-    ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
+-    return frameSizeInfo.compressedSize;
++    return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1);
+ }
+ 
+ /* ZSTD_decompressBound() :
+@@ -760,7 +794,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
+     unsigned long long bound = 0;
+     /* Iterate over each frame */
+     while (srcSize > 0) {
+-        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
++        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1);
+         size_t const compressedSize = frameSizeInfo.compressedSize;
+         unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
+         if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
+@@ -773,6 +807,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
      return bound;
  }
  
@@ -22068,7 +30668,7 @@ index 6b3177c94711..4b3b88715f18 100644
 +
 +    /* Iterate over each frame */
 +    while (srcSize > 0) {
-+        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
++        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1);
 +        size_t const compressedSize = frameSizeInfo.compressedSize;
 +        unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
 +        ZSTD_frameHeader zfh;
@@ -22106,7 +30706,47 @@ index 6b3177c94711..4b3b88715f18 100644
  
  /*-*************************************************************
   *   Frame decoding
-@@ -930,6 +1001,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
+@@ -856,6 +932,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
+         ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize;
+     }
+ 
++    /* Shrink the blockSizeMax if enabled */
++    if (dctx->maxBlockSizeParam != 0)
++        dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam);
++
+     /* Loop on each block */
+     while (1) {
+         BYTE* oBlockEnd = oend;
+@@ -888,7 +968,8 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
+         switch(blockProperties.blockType)
+         {
+         case bt_compressed:
+-            decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming);
++            assert(dctx->isFrameDecompression == 1);
++            decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming);
+             break;
+         case bt_raw :
+             /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */
+@@ -901,12 +982,14 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
+         default:
+             RETURN_ERROR(corruption_detected, "invalid block type");
+         }
+-
+-        if (ZSTD_isError(decodedSize)) return decodedSize;
+-        if (dctx->validateChecksum)
++        FORWARD_IF_ERROR(decodedSize, "Block decompression failure");
++        DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize);
++        if (dctx->validateChecksum) {
+             xxh64_update(&dctx->xxhState, op, decodedSize);
+-        if (decodedSize != 0)
++        }
++        if (decodedSize) /* support dst = NULL,0 */ {
+             op += decodedSize;
++        }
+         assert(ip != NULL);
+         ip += cBlockSize;
+         remainingSrcSize -= cBlockSize;
+@@ -930,12 +1013,15 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
      }
      ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0);
      /* Allow caller to get size read */
@@ -22114,14 +30754,23 @@ index 6b3177c94711..4b3b88715f18 100644
      *srcPtr = ip;
      *srcSizePtr = remainingSrcSize;
      return (size_t)(op-ostart);
-@@ -955,17 +1027,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
+ }
+ 
+-static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
++static
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
++size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
+                                         void* dst, size_t dstCapacity,
+                                   const void* src, size_t srcSize,
+                                   const void* dict, size_t dictSize,
+@@ -955,17 +1041,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
      while (srcSize >= ZSTD_startingInputLength(dctx->format)) {
  
  
 -        {   U32 const magicNumber = MEM_readLE32(src);
 -            DEBUGLOG(4, "reading magic number %08X (expecting %08X)",
 -                        (unsigned)magicNumber, ZSTD_MAGICNUMBER);
-+        if (srcSize >= 4) {
++        if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) {
 +            U32 const magicNumber = MEM_readLE32(src);
 +            DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber);
              if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
@@ -22138,7 +30787,7 @@ index 6b3177c94711..4b3b88715f18 100644
          }   }
  
          if (ddict) {
-@@ -1061,8 +1134,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr
+@@ -1061,8 +1148,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr
  size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; }
  
  /*
@@ -22149,7 +30798,25 @@ index 6b3177c94711..4b3b88715f18 100644
   * be streamed.
   *
   * For blocks that can be streamed, this allows us to reduce the latency until we produce
-@@ -1262,7 +1335,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
+@@ -1181,7 +1268,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
+             {
+             case bt_compressed:
+                 DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed");
+-                rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming);
++                assert(dctx->isFrameDecompression == 1);
++                rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming);
+                 dctx->expected = 0;  /* Streaming not supported */
+                 break;
+             case bt_raw :
+@@ -1250,6 +1338,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
+     case ZSTDds_decodeSkippableHeader:
+         assert(src != NULL);
+         assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE);
++        assert(dctx->format != ZSTD_f_zstd1_magicless);
+         ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize);   /* complete skippable header */
+         dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
+         dctx->stage = ZSTDds_skipFrame;
+@@ -1262,7 +1351,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
  
      default:
          assert(0);   /* impossible */
@@ -22158,7 +30825,7 @@ index 6b3177c94711..4b3b88715f18 100644
      }
  }
  
-@@ -1303,11 +1376,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
+@@ -1303,11 +1392,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
          /* in minimal huffman, we always use X1 variants */
          size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable,
                                                  dictPtr, dictEnd - dictPtr,
@@ -22172,7 +30839,7 @@ index 6b3177c94711..4b3b88715f18 100644
  #endif
          RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, "");
          dictPtr += hSize;
-@@ -1403,7 +1476,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
+@@ -1403,10 +1492,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
      dctx->prefixStart = NULL;
      dctx->virtualStart = NULL;
      dctx->dictEnd = NULL;
@@ -22181,7 +30848,11 @@ index 6b3177c94711..4b3b88715f18 100644
      dctx->litEntropy = dctx->fseEntropy = 0;
      dctx->dictID = 0;
      dctx->bType = bt_reserved;
-@@ -1465,7 +1538,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
++    dctx->isFrameDecompression = 1;
+     ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue));
+     ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue));  /* initial repcodes */
+     dctx->LLTptr = dctx->entropy.LLTable;
+@@ -1465,7 +1555,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
   *  This could for one of the following reasons :
   *  - The frame does not require a dictionary (most common case).
   *  - The frame was built with dictID intentionally removed.
@@ -22190,7 +30861,7 @@ index 6b3177c94711..4b3b88715f18 100644
   *    Note : this use case also happens when using a non-conformant dictionary.
   *  - `srcSize` is too small, and as a result, frame header could not be decoded.
   *    Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
-@@ -1474,7 +1547,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
+@@ -1474,7 +1564,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
   *  ZSTD_getFrameHeader(), which will provide a more precise error code. */
  unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
  {
@@ -22199,7 +30870,7 @@ index 6b3177c94711..4b3b88715f18 100644
      size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize);
      if (ZSTD_isError(hError)) return 0;
      return zfp.dictID;
-@@ -1581,7 +1654,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di
+@@ -1581,7 +1671,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di
  size_t ZSTD_initDStream(ZSTD_DStream* zds)
  {
      DEBUGLOG(4, "ZSTD_initDStream");
@@ -22210,7 +30881,7 @@ index 6b3177c94711..4b3b88715f18 100644
  }
  
  /* ZSTD_initDStream_usingDDict() :
-@@ -1589,6 +1664,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds)
+@@ -1589,6 +1681,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds)
   * this function cannot fail */
  size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
  {
@@ -22218,7 +30889,7 @@ index 6b3177c94711..4b3b88715f18 100644
      FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , "");
      FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , "");
      return ZSTD_startingInputLength(dctx->format);
-@@ -1599,6 +1675,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
+@@ -1599,6 +1692,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
   * this function cannot fail */
  size_t ZSTD_resetDStream(ZSTD_DStream* dctx)
  {
@@ -22226,7 +30897,7 @@ index 6b3177c94711..4b3b88715f18 100644
      FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), "");
      return ZSTD_startingInputLength(dctx->format);
  }
-@@ -1670,6 +1747,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam)
+@@ -1670,6 +1764,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam)
              bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict;
              bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts;
              return bounds;
@@ -22234,32 +30905,85 @@ index 6b3177c94711..4b3b88715f18 100644
 +            bounds.lowerBound = 0;
 +            bounds.upperBound = 1;
 +            return bounds;
++        case ZSTD_d_maxBlockSize:
++            bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN;
++            bounds.upperBound = ZSTD_BLOCKSIZE_MAX;
++            return bounds;
 +
          default:;
      }
      bounds.error = ERROR(parameter_unsupported);
-@@ -1710,6 +1792,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value
+@@ -1710,6 +1813,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value
          case ZSTD_d_refMultipleDDicts:
              *value = (int)dctx->refMultipleDDicts;
              return 0;
 +        case ZSTD_d_disableHuffmanAssembly:
 +            *value = (int)dctx->disableHufAsm;
++            return 0;
++        case ZSTD_d_maxBlockSize:
++            *value = dctx->maxBlockSizeParam;
 +            return 0;
          default:;
      }
      RETURN_ERROR(parameter_unsupported, "");
-@@ -1743,6 +1828,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value
+@@ -1743,6 +1852,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value
              }
              dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value;
              return 0;
 +        case ZSTD_d_disableHuffmanAssembly:
 +            CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value);
 +            dctx->disableHufAsm = value != 0;
++            return 0;
++        case ZSTD_d_maxBlockSize:
++            if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value);
++            dctx->maxBlockSizeParam = value;
 +            return 0;
          default:;
      }
      RETURN_ERROR(parameter_unsupported, "");
-@@ -1918,7 +2007,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+@@ -1754,6 +1871,7 @@ size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset)
+       || (reset == ZSTD_reset_session_and_parameters) ) {
+         dctx->streamStage = zdss_init;
+         dctx->noForwardProgress = 0;
++        dctx->isFrameDecompression = 1;
+     }
+     if ( (reset == ZSTD_reset_parameters)
+       || (reset == ZSTD_reset_session_and_parameters) ) {
+@@ -1770,11 +1888,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx)
+     return ZSTD_sizeof_DCtx(dctx);
+ }
+ 
+-size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)
++static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax)
+ {
+-    size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+-    /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/
+-    unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2);
++    size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax);
++    /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block
++     * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing
++     * the block at the beginning of the output buffer, and maintain a full window.
++     *
++     * We need another blockSize worth of buffer so that we can store split
++     * literals at the end of the block without overwriting the extDict window.
++     */
++    unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2);
+     unsigned long long const neededSize = MIN(frameContentSize, neededRBSize);
+     size_t const minRBSize = (size_t) neededSize;
+     RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize,
+@@ -1782,6 +1906,11 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long
+     return minRBSize;
+ }
+ 
++size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)
++{
++    return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX);
++}
++
+ size_t ZSTD_estimateDStreamSize(size_t windowSize)
+ {
+     size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+@@ -1918,7 +2047,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                  if (zds->refMultipleDDicts && zds->ddictSet) {
                      ZSTD_DCtx_selectFrameDDict(zds);
                  }
@@ -22267,7 +30991,7 @@ index 6b3177c94711..4b3b88715f18 100644
                  if (ZSTD_isError(hSize)) {
                      return hSize;   /* error */
                  }
-@@ -1932,6 +2020,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+@@ -1932,6 +2060,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                              zds->lhSize += remainingInput;
                          }
                          input->pos = input->size;
@@ -22279,10 +31003,18 @@ index 6b3177c94711..4b3b88715f18 100644
                          return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize;   /* remaining header bytes + next block header */
                      }
                      assert(ip != NULL);
-@@ -1949,8 +2042,9 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+@@ -1943,14 +2076,15 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+             if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN
+                 && zds->fParams.frameType != ZSTD_skippableFrame
+                 && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) {
+-                size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart));
++                size_t const cSize = ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format);
+                 if (cSize <= (size_t)(iend-istart)) {
+                     /* shortcut : using single-pass mode */
                      size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds));
                      if (ZSTD_isError(decompressedSize)) return decompressedSize;
-                     DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()")
+-                    DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()")
++                    DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()");
 +                    assert(istart != NULL);
                      ip = istart + cSize;
 -                    op += decompressedSize;
@@ -22290,7 +31022,32 @@ index 6b3177c94711..4b3b88715f18 100644
                      zds->expected = 0;
                      zds->streamStage = zdss_init;
                      someMoreWork = 0;
-@@ -2034,6 +2128,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+@@ -1969,7 +2103,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+             DEBUGLOG(4, "Consume header");
+             FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), "");
+ 
+-            if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
++            if (zds->format == ZSTD_f_zstd1
++                && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
+                 zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE);
+                 zds->stage = ZSTDds_skipFrame;
+             } else {
+@@ -1985,11 +2120,13 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+             zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN);
+             RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize,
+                             frameParameter_windowTooLarge, "");
++            if (zds->maxBlockSizeParam != 0)
++                zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam);
+ 
+             /* Adapt buffer sizes to frame header instructions */
+             {   size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */);
+                 size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered
+-                        ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize)
++                        ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax)
+                         : 0;
+ 
+                 ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize);
+@@ -2034,6 +2171,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                  }
                  if ((size_t)(iend-ip) >= neededInSize) {  /* decode directly from src */
                      FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), "");
@@ -22298,7 +31055,7 @@ index 6b3177c94711..4b3b88715f18 100644
                      ip += neededInSize;
                      /* Function modifies the stage so we must break */
                      break;
-@@ -2048,7 +2143,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+@@ -2048,7 +2186,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                  int const isSkipFrame = ZSTD_isSkipFrame(zds);
                  size_t loadedSize;
                  /* At this point we shouldn't be decompressing a block that we can stream. */
@@ -22307,7 +31064,7 @@ index 6b3177c94711..4b3b88715f18 100644
                  if (isSkipFrame) {
                      loadedSize = MIN(toLoad, (size_t)(iend-ip));
                  } else {
-@@ -2057,8 +2152,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+@@ -2057,8 +2195,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                                      "should never happen");
                      loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip));
                  }
@@ -22321,7 +31078,7 @@ index 6b3177c94711..4b3b88715f18 100644
                  if (loadedSize < toLoad) { someMoreWork = 0; break; }   /* not enough input, wait for more */
  
                  /* decode loaded input */
-@@ -2068,14 +2166,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+@@ -2068,14 +2209,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                  break;
              }
          case zdss_flush:
@@ -22342,7 +31099,7 @@ index 6b3177c94711..4b3b88715f18 100644
                          DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)",
                                  (int)(zds->outBuffSize - zds->outStart),
                                  (U32)zds->fParams.blockSizeMax);
-@@ -2089,7 +2190,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+@@ -2089,7 +2233,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
  
          default:
              assert(0);    /* impossible */
@@ -22351,7 +31108,7 @@ index 6b3177c94711..4b3b88715f18 100644
      }   }
  
      /* result */
-@@ -2102,8 +2203,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
+@@ -2102,8 +2246,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
      if ((ip==istart) && (op==ostart)) {  /* no forward progress */
          zds->noForwardProgress ++;
          if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {
@@ -22362,7 +31119,7 @@ index 6b3177c94711..4b3b88715f18 100644
              assert(0);
          }
      } else {
-@@ -2140,11 +2241,17 @@ size_t ZSTD_decompressStream_simpleArgs (
+@@ -2140,11 +2284,17 @@ size_t ZSTD_decompressStream_simpleArgs (
                              void* dst, size_t dstCapacity, size_t* dstPos,
                        const void* src, size_t srcSize, size_t* srcPos)
  {
@@ -22388,7 +31145,7 @@ index 6b3177c94711..4b3b88715f18 100644
 +    }
  }
 diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c
-index c1913b8e7c89..9f5577e5bc19 100644
+index c1913b8e7c89..9fe9a12c8a2c 100644
 --- a/lib/zstd/decompress/zstd_decompress_block.c
 +++ b/lib/zstd/decompress/zstd_decompress_block.c
 @@ -1,5 +1,6 @@
@@ -22413,16 +31170,110 @@ index c1913b8e7c89..9f5577e5bc19 100644
  
  /*_*******************************************************
  *  Macros
-@@ -89,7 +90,7 @@ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const
+@@ -51,6 +52,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }
+  *   Block decoding
+  ***************************************************************/
+ 
++static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx)
++{
++    size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX;
++    assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX);
++    return blockSizeMax;
++}
++
+ /*! ZSTD_getcBlockSize() :
+  *  Provides the size of compressed block from block header `src` */
+ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+@@ -73,41 +81,49 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
+     const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
+ {
+-    if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH)
+-    {
+-        /* room for litbuffer to fit without read faulting */
+-        dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;
++    size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
++    assert(litSize <= blockSizeMax);
++    assert(dctx->isFrameDecompression || streaming == not_streaming);
++    assert(expectedWriteSize <= blockSizeMax);
++    if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) {
++        /* If we aren't streaming, we can just put the literals after the output
++         * of the current block. We don't need to worry about overwriting the
++         * extDict of our window, because it doesn't exist.
++         * So if we have space after the end of the block, just put it there.
++         */
++        dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH;
+         dctx->litBufferEnd = dctx->litBuffer + litSize;
+         dctx->litBufferLocation = ZSTD_in_dst;
+-    }
+-    else if (litSize > ZSTD_LITBUFFEREXTRASIZE)
+-    {
+-        /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
++    } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) {
++        /* Literals fit entirely within the extra buffer, put them there to avoid
++         * having to split the literals.
++         */
++        dctx->litBuffer = dctx->litExtraBuffer;
++        dctx->litBufferEnd = dctx->litBuffer + litSize;
++        dctx->litBufferLocation = ZSTD_not_in_dst;
++    } else {
++        assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE);
++        /* Literals must be split between the output block and the extra lit
++         * buffer. We fill the extra lit buffer with the tail of the literals,
++         * and put the rest of the literals at the end of the block, with
++         * WILDCOPY_OVERLENGTH of buffer room to allow for overreads.
++         * This MUST not write more than our maxBlockSize beyond dst, because in
++         * streaming mode, that could overwrite part of our extDict window.
++         */
+         if (splitImmediately) {
+             /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
+             dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
              dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
-         }
-         else {
+-        }
+-        else {
 -            /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */
++        } else {
 +            /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
              dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
              dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
          }
-@@ -134,13 +135,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+         dctx->litBufferLocation = ZSTD_split;
+-    }
+-    else
+-    {
+-        /* fits entirely within litExtraBuffer, so no split is necessary */
+-        dctx->litBuffer = dctx->litExtraBuffer;
+-        dctx->litBufferEnd = dctx->litBuffer + litSize;
+-        dctx->litBufferLocation = ZSTD_not_in_dst;
++        assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize);
+     }
+ }
+ 
+-/* Hidden declaration for fullbench */
+-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+-                          const void* src, size_t srcSize,
+-                          void* dst, size_t dstCapacity, const streaming_operation streaming);
+ /*! ZSTD_decodeLiteralsBlock() :
+  * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
+  * in the dstBuffer.  If there is room to do so, it will be stored in full in the excess dst space after where the current
+@@ -116,7 +132,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+  *
+  * @return : nb of bytes read from src (< srcSize )
+  *  note : symbol not declared but exposed for fullbench */
+-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
++static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+                           const void* src, size_t srcSize,   /* note : srcSize < BLOCKSIZE */
+                           void* dst, size_t dstCapacity, const streaming_operation streaming)
+ {
+@@ -125,6 +141,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+ 
+     {   const BYTE* const istart = (const BYTE*) src;
+         symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
++        size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
+ 
+         switch(litEncType)
+         {
+@@ -134,13 +151,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
              ZSTD_FALLTHROUGH;
  
          case set_compressed:
@@ -22433,17 +31284,20 @@ index c1913b8e7c89..9f5577e5bc19 100644
                  U32 const lhlCode = (istart[0] >> 2) & 3;
                  U32 const lhc = MEM_readLE32(istart);
                  size_t hufSuccess;
-                 size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
+-                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
++                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
 +                int const flags = 0
 +                    | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
 +                    | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
                  switch(lhlCode)
                  {
                  case 0: case 1: default:   /* note : default is impossible, since lhlCode into [0..3] */
-@@ -165,6 +169,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+@@ -164,7 +184,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+                     break;
                  }
                  RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
-                 RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+-                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
++                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
 +                if (!singleStream)
 +                    RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
 +                        "Not enough literals (%zu) for the 4-streams mode (min %u)",
@@ -22451,7 +31305,7 @@ index c1913b8e7c89..9f5577e5bc19 100644
                  RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
                  RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
                  ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);
-@@ -176,13 +184,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+@@ -176,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  
                  if (litEncType==set_repeat) {
                      if (singleStream) {
@@ -22470,7 +31324,7 @@ index c1913b8e7c89..9f5577e5bc19 100644
                      }
                  } else {
                      if (singleStream) {
-@@ -190,18 +199,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+@@ -190,26 +215,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                          hufSuccess = HUF_decompress1X_DCtx_wksp(
                              dctx->entropy.hufTable, dctx->litBuffer, litSize,
                              istart+lhSize, litCSize, dctx->workspace,
@@ -22494,7 +31348,26 @@ index c1913b8e7c89..9f5577e5bc19 100644
                      }
                  }
                  if (dctx->litBufferLocation == ZSTD_split)
-@@ -237,6 +246,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+                 {
++                    assert(litSize > ZSTD_LITBUFFEREXTRASIZE);
+                     ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
+                     ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
+                     dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
+                     dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
++                    assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax);
+                 }
+ 
+                 RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");
+@@ -224,7 +251,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+         case set_basic:
+             {   size_t litSize, lhSize;
+                 U32 const lhlCode = ((istart[0]) >> 2) & 3;
+-                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
++                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
+                 switch(lhlCode)
+                 {
+                 case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
+@@ -237,11 +264,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                      break;
                  case 3:
                      lhSize = 3;
@@ -22502,7 +31375,22 @@ index c1913b8e7c89..9f5577e5bc19 100644
                      litSize = MEM_readLE24(istart) >> 4;
                      break;
                  }
-@@ -279,12 +289,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+ 
+                 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
++                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
+                 RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+                 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
+                 if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
+@@ -270,7 +299,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+         case set_rle:
+             {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
+                 size_t litSize, lhSize;
+-                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
++                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
+                 switch(lhlCode)
+                 {
+                 case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
+@@ -279,16 +308,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                      break;
                  case 1:
                      lhSize = 2;
@@ -22517,7 +31405,31 @@ index c1913b8e7c89..9f5577e5bc19 100644
                      break;
                  }
                  RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
-@@ -506,14 +517,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
+-                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
++                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
+                 RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
+                 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
+                 if (dctx->litBufferLocation == ZSTD_split)
+@@ -310,6 +340,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+     }
+ }
+ 
++/* Hidden declaration for fullbench */
++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
++                          const void* src, size_t srcSize,
++                          void* dst, size_t dstCapacity);
++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
++                          const void* src, size_t srcSize,
++                          void* dst, size_t dstCapacity)
++{
++    dctx->isFrameDecompression = 0;
++    return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming);
++}
++
+ /* Default FSE distribution tables.
+  * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
+  * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
+@@ -506,14 +548,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
                  for (i = 8; i < n; i += 8) {
                      MEM_write64(spread + pos + i, sv);
                  }
@@ -22536,7 +31448,7 @@ index c1913b8e7c89..9f5577e5bc19 100644
           */
          {
              size_t position = 0;
-@@ -540,7 +552,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
+@@ -540,7 +583,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
              for (i=0; i<n; i++) {
                  tableDecode[position].baseValue = s;
                  position = (position + step) & tableMask;
@@ -22545,7 +31457,7 @@ index c1913b8e7c89..9f5577e5bc19 100644
          }   }
          assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
      }
-@@ -551,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
+@@ -551,7 +594,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
          for (u=0; u<tableSize; u++) {
              U32 const symbol = tableDecode[u].baseValue;
              U32 const nextState = symbolNext[symbol]++;
@@ -22554,7 +31466,69 @@ index c1913b8e7c89..9f5577e5bc19 100644
              tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
              assert(nbAdditionalBits[symbol] < 255);
              tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
-@@ -964,6 +976,11 @@ size_t ZSTD_execSequence(BYTE* op,
+@@ -664,11 +707,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+ 
+     /* SeqHead */
+     nbSeq = *ip++;
+-    if (!nbSeq) {
+-        *nbSeqPtr=0;
+-        RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
+-        return 1;
+-    }
+     if (nbSeq > 0x7F) {
+         if (nbSeq == 0xFF) {
+             RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
+@@ -681,8 +719,16 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+     }
+     *nbSeqPtr = nbSeq;
+ 
++    if (nbSeq == 0) {
++        /* No sequence : section ends immediately */
++        RETURN_ERROR_IF(ip != iend, corruption_detected,
++            "extraneous data present in the Sequences section");
++        return (size_t)(ip - istart);
++    }
++
+     /* FSE table descriptors */
+     RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
++    RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */
+     {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
+         symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
+         symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
+@@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, pt
+ /* ZSTD_safecopyDstBeforeSrc():
+  * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
+  * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
+-static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) {
++static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) {
+     ptrdiff_t const diff = op - ip;
+     BYTE* const oend = op + length;
+ 
+@@ -858,6 +904,7 @@ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length
+  * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
+  */
+ FORCE_NOINLINE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_execSequenceEnd(BYTE* op,
+     BYTE* const oend, seq_t sequence,
+     const BYTE** litPtr, const BYTE* const litLimit,
+@@ -905,6 +952,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op,
+  * This version is intended to be used during instances where the litBuffer is still split.  It is kept separate to avoid performance impact for the good case.
+  */
+ FORCE_NOINLINE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
+     BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
+     const BYTE** litPtr, const BYTE* const litLimit,
+@@ -950,6 +998,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
+ }
+ 
+ HINT_INLINE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_execSequence(BYTE* op,
+     BYTE* const oend, seq_t sequence,
+     const BYTE** litPtr, const BYTE* const litLimit,
+@@ -964,6 +1013,11 @@ size_t ZSTD_execSequence(BYTE* op,
  
      assert(op != NULL /* Precondition */);
      assert(oend_w < oend /* No underflow */);
@@ -22566,7 +31540,15 @@ index c1913b8e7c89..9f5577e5bc19 100644
      /* Handle edge cases in a slow path:
       *   - Read beyond end of literals
       *   - Match end is within WILDCOPY_OVERLIMIT of oend
-@@ -1154,7 +1171,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16
+@@ -1043,6 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op,
+ }
+ 
+ HINT_INLINE
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
+     BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
+     const BYTE** litPtr, const BYTE* const litLimit,
+@@ -1154,7 +1209,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16
  }
  
  /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
@@ -22575,14 +31557,25 @@ index c1913b8e7c89..9f5577e5bc19 100644
   * bits before reloading. This value is the maximum number of bytes we read
   * after reloading when we are decoding long offsets.
   */
-@@ -1169,9 +1186,27 @@ FORCE_INLINE_TEMPLATE seq_t
- ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+@@ -1165,13 +1220,37 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16
+ 
+ typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
+ 
++/*
++ * ZSTD_decodeSequence():
++ * @p longOffsets : tells the decoder to reload more bits while decoding large offsets
++ *                  only used in 32-bit mode
++ * @return : Sequence (litL + matchL + offset)
++ */
+ FORCE_INLINE_TEMPLATE seq_t
+-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
++ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq)
  {
      seq_t seq;
 +    /*
-+     * ZSTD_seqSymbol is a structure with a total of 64 bits wide. So it can be
-+     * loaded in one operation and extracted its fields by simply shifting or
-+     * bit-extracting on aarch64.
++     * ZSTD_seqSymbol is a 64 bits wide structure.
++     * It can be loaded in one operation
++     * and its fields extracted by simply shifting or bit-extracting on aarch64.
 +     * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
 +     * operations that cause performance drop. This can be avoided by using this
 +     * ZSTD_memcpy hack.
@@ -22603,7 +31596,7 @@ index c1913b8e7c89..9f5577e5bc19 100644
      seq.matchLength = mlDInfo->baseValue;
      seq.litLength = llDInfo->baseValue;
      {   U32 const ofBase = ofDInfo->baseValue;
-@@ -1186,28 +1221,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+@@ -1186,28 +1265,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
          U32 const llnbBits = llDInfo->nbBits;
          U32 const mlnbBits = mlDInfo->nbBits;
          U32 const ofnbBits = ofDInfo->nbBits;
@@ -22644,7 +31637,16 @@ index c1913b8e7c89..9f5577e5bc19 100644
                  } else {
                      offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
                      if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
-@@ -1232,11 +1270,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+@@ -1224,7 +1306,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+                 } else {
+                     offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
+                     {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+-                        temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
++                        temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */
+                         if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+                         seqState->prevOffset[1] = seqState->prevOffset[0];
+                         seqState->prevOffset[0] = offset = temp;
+@@ -1232,11 +1314,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
              seq.offset = offset;
          }
  
@@ -22656,7 +31658,7 @@ index c1913b8e7c89..9f5577e5bc19 100644
              seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
  
          if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
-@@ -1246,11 +1280,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+@@ -1246,11 +1324,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
          /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
          ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
  
@@ -22668,16 +31670,656 @@ index c1913b8e7c89..9f5577e5bc19 100644
              seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
  
          if (MEM_32bits())
-@@ -1552,7 +1582,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
+@@ -1259,17 +1333,22 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+         DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+                     (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+ 
+-        ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
+-        ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
+-        if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
+-        ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */
++        if (!isLastSeq) {
++            /* don't update FSE state for last Sequence */
++            ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
++            ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
++            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
++            ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */
++            BIT_reloadDStream(&seqState->DStream);
++        }
+     }
+ 
+     return seq;
+ }
+ 
+-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+-MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
++#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
++#if DEBUGLEVEL >= 1
++static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
+ {
+     size_t const windowSize = dctx->fParams.windowSize;
+     /* No dictionary used. */
+@@ -1283,30 +1362,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix
+     /* Dictionary is active. */
+     return 1;
+ }
++#endif
+ 
+-MEM_STATIC void ZSTD_assertValidSequence(
++static void ZSTD_assertValidSequence(
+         ZSTD_DCtx const* dctx,
+         BYTE const* op, BYTE const* oend,
+         seq_t const seq,
+         BYTE const* prefixStart, BYTE const* virtualStart)
+ {
+ #if DEBUGLEVEL >= 1
+-    size_t const windowSize = dctx->fParams.windowSize;
+-    size_t const sequenceSize = seq.litLength + seq.matchLength;
+-    BYTE const* const oLitEnd = op + seq.litLength;
+-    DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
+-            (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+-    assert(op <= oend);
+-    assert((size_t)(oend - op) >= sequenceSize);
+-    assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
+-    if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
+-        size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
+-        /* Offset must be within the dictionary. */
+-        assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
+-        assert(seq.offset <= windowSize + dictSize);
+-    } else {
+-        /* Offset must be within our window. */
+-        assert(seq.offset <= windowSize);
++    if (dctx->isFrameDecompression) {
++        size_t const windowSize = dctx->fParams.windowSize;
++        size_t const sequenceSize = seq.litLength + seq.matchLength;
++        BYTE const* const oLitEnd = op + seq.litLength;
++        DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
++                (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
++        assert(op <= oend);
++        assert((size_t)(oend - op) >= sequenceSize);
++        assert(sequenceSize <= ZSTD_blockSizeMax(dctx));
++        if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
++            size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
++            /* Offset must be within the dictionary. */
++            assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
++            assert(seq.offset <= windowSize + dictSize);
++        } else {
++            /* Offset must be within our window. */
++            assert(seq.offset <= windowSize);
++        }
+     }
+ #else
+     (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
+@@ -1322,23 +1404,21 @@ DONT_VECTORIZE
+ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
+                                void* dst, size_t maxDstSize,
+                          const void* seqStart, size_t seqSize, int nbSeq,
+-                         const ZSTD_longOffset_e isLongOffset,
+-                         const int frame)
++                         const ZSTD_longOffset_e isLongOffset)
+ {
+     const BYTE* ip = (const BYTE*)seqStart;
+     const BYTE* const iend = ip + seqSize;
+     BYTE* const ostart = (BYTE*)dst;
+-    BYTE* const oend = ostart + maxDstSize;
++    BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
+     BYTE* op = ostart;
+     const BYTE* litPtr = dctx->litPtr;
+     const BYTE* litBufferEnd = dctx->litBufferEnd;
+     const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+     const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
+     const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+-    DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");
+-    (void)frame;
++    DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq);
+ 
+-    /* Regen sequences */
++    /* Literals are split between internal buffer & output buffer */
+     if (nbSeq) {
+         seqState_t seqState;
+         dctx->fseEntropy = 1;
+@@ -1357,8 +1437,7 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
+                 BIT_DStream_completed < BIT_DStream_overflow);
+ 
+         /* decompress without overrunning litPtr begins */
+-        {
+-            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
++        {   seq_t sequence = {0,0,0};  /* some static analyzers believe that @sequence is not initialized (it necessarily is, since the for(;;) loop has at least one iteration) */
+             /* Align the decompression loop to 32 + 16 bytes.
+                 *
+                 * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
+@@ -1420,27 +1499,26 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
+ #endif
+ 
+             /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
+-            for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) {
+-                size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
++            for ( ; nbSeq; nbSeq--) {
++                sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
++                if (litPtr + sequence.litLength > dctx->litBufferEnd) break;
++                {   size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+-                assert(!ZSTD_isError(oneSeqSize));
+-                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
++                    assert(!ZSTD_isError(oneSeqSize));
++                    ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+-                if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+-                    return oneSeqSize;
+-                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+-                op += oneSeqSize;
+-                if (UNLIKELY(!--nbSeq))
+-                    break;
+-                BIT_reloadDStream(&(seqState.DStream));
+-                sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+-            }
++                    if (UNLIKELY(ZSTD_isError(oneSeqSize)))
++                        return oneSeqSize;
++                    DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
++                    op += oneSeqSize;
++            }   }
++            DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)");
+ 
+             /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */
+             if (nbSeq > 0) {
+                 const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+-                if (leftoverLit)
+-                {
++                DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength);
++                if (leftoverLit) {
+                     RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+                     ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+                     sequence.litLength -= leftoverLit;
+@@ -1449,24 +1527,22 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
+                 litPtr = dctx->litExtraBuffer;
+                 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+                 dctx->litBufferLocation = ZSTD_not_in_dst;
+-                {
+-                    size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
++                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                     assert(!ZSTD_isError(oneSeqSize));
+-                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
++                    ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+                     if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                         return oneSeqSize;
+                     DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+                     op += oneSeqSize;
+-                    if (--nbSeq)
+-                        BIT_reloadDStream(&(seqState.DStream));
+                 }
++                nbSeq--;
+             }
+         }
+ 
+-        if (nbSeq > 0) /* there is remaining lit from extra buffer */
+-        {
++        if (nbSeq > 0) {
++            /* there is remaining lit from extra buffer */
+ 
+ #if defined(__x86_64__)
+             __asm__(".p2align 6");
+@@ -1485,35 +1561,34 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
+ #  endif
+ #endif
+ 
+-            for (; ; ) {
+-                seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
++            for ( ; nbSeq ; nbSeq--) {
++                seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
+                 size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                 assert(!ZSTD_isError(oneSeqSize));
+-                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
++                ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+                 if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                     return oneSeqSize;
+                 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+                 op += oneSeqSize;
+-                if (UNLIKELY(!--nbSeq))
+-                    break;
+-                BIT_reloadDStream(&(seqState.DStream));
+             }
+         }
+ 
+         /* check if reached exact end */
+         DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
+         RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+-        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
++        DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed);
++        RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
+         /* save reps for next block */
+         { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+     }
+ 
+     /* last literal segment */
+-    if (dctx->litBufferLocation == ZSTD_split)  /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
+-    {
+-        size_t const lastLLSize = litBufferEnd - litPtr;
++    if (dctx->litBufferLocation == ZSTD_split) {
++        /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
++        size_t const lastLLSize = (size_t)(litBufferEnd - litPtr);
++        DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize);
+         RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+         if (op != NULL) {
+             ZSTD_memmove(op, litPtr, lastLLSize);
+@@ -1523,15 +1598,17 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
+         litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+         dctx->litBufferLocation = ZSTD_not_in_dst;
+     }
+-    {   size_t const lastLLSize = litBufferEnd - litPtr;
++    /* copy last literals from internal buffer */
++    {   size_t const lastLLSize = (size_t)(litBufferEnd - litPtr);
++        DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize);
+         RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+         if (op != NULL) {
+             ZSTD_memcpy(op, litPtr, lastLLSize);
+             op += lastLLSize;
+-        }
+-    }
++    }   }
+ 
+-    return op-ostart;
++    DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart));
++    return (size_t)(op - ostart);
+ }
+ 
+ FORCE_INLINE_TEMPLATE size_t
+@@ -1539,21 +1616,19 @@ DONT_VECTORIZE
+ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
+     void* dst, size_t maxDstSize,
+     const void* seqStart, size_t seqSize, int nbSeq,
+-    const ZSTD_longOffset_e isLongOffset,
+-    const int frame)
++    const ZSTD_longOffset_e isLongOffset)
+ {
+     const BYTE* ip = (const BYTE*)seqStart;
+     const BYTE* const iend = ip + seqSize;
+     BYTE* const ostart = (BYTE*)dst;
+-    BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;
++    BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer;
+     BYTE* op = ostart;
+     const BYTE* litPtr = dctx->litPtr;
+     const BYTE* const litEnd = litPtr + dctx->litSize;
      const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
      const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
      const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
 -    DEBUGLOG(5, "ZSTD_decompressSequences_body");
+-    (void)frame;
 +    DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
-     (void)frame;
  
      /* Regen sequences */
-@@ -1945,34 +1975,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
+     if (nbSeq) {
+@@ -1568,11 +1643,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
+         ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+         assert(dst != NULL);
+ 
+-        ZSTD_STATIC_ASSERT(
+-            BIT_DStream_unfinished < BIT_DStream_completed &&
+-            BIT_DStream_endOfBuffer < BIT_DStream_completed &&
+-            BIT_DStream_completed < BIT_DStream_overflow);
+-
+ #if defined(__x86_64__)
+             __asm__(".p2align 6");
+             __asm__("nop");
+@@ -1587,73 +1657,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
+ #  endif
+ #endif
+ 
+-        for ( ; ; ) {
+-            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
++        for ( ; nbSeq ; nbSeq--) {
++            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
+             size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+             assert(!ZSTD_isError(oneSeqSize));
+-            if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
++            ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+ #endif
+             if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                 return oneSeqSize;
+             DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+             op += oneSeqSize;
+-            if (UNLIKELY(!--nbSeq))
+-                break;
+-            BIT_reloadDStream(&(seqState.DStream));
+         }
+ 
+         /* check if reached exact end */
+-        DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
+-        RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+-        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
++        assert(nbSeq == 0);
++        RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
+         /* save reps for next block */
+         { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+     }
+ 
+     /* last literal segment */
+-    {   size_t const lastLLSize = litEnd - litPtr;
++    {   size_t const lastLLSize = (size_t)(litEnd - litPtr);
++        DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize);
+         RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+         if (op != NULL) {
+             ZSTD_memcpy(op, litPtr, lastLLSize);
+             op += lastLLSize;
+-        }
+-    }
++    }   }
+ 
+-    return op-ostart;
++    DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart));
++    return (size_t)(op - ostart);
+ }
+ 
+ static size_t
+ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
+                                  void* dst, size_t maxDstSize,
+                            const void* seqStart, size_t seqSize, int nbSeq,
+-                           const ZSTD_longOffset_e isLongOffset,
+-                           const int frame)
++                           const ZSTD_longOffset_e isLongOffset)
+ {
+-    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
++    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ }
+ 
+ static size_t
+ ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
+                                                void* dst, size_t maxDstSize,
+                                          const void* seqStart, size_t seqSize, int nbSeq,
+-                                         const ZSTD_longOffset_e isLongOffset,
+-                                         const int frame)
++                                         const ZSTD_longOffset_e isLongOffset)
+ {
+-    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
++    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ }
+ #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
+ 
+ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+ 
+-FORCE_INLINE_TEMPLATE size_t
+-ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
++FORCE_INLINE_TEMPLATE
++
++size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
+                    const BYTE* const prefixStart, const BYTE* const dictEnd)
+ {
+     prefetchPos += sequence.litLength;
+     {   const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
+-        const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
+-                                                                              * No consequence though : memory address is only used for prefetching, not for dereferencing */
++        /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
++         * No consequence though : memory address is only used for prefetching, not for dereferencing */
++        const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset);
+         PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE);   /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+     }
+     return prefetchPos + sequence.matchLength;
+@@ -1668,20 +1735,18 @@ ZSTD_decompressSequencesLong_body(
+                                ZSTD_DCtx* dctx,
+                                void* dst, size_t maxDstSize,
+                          const void* seqStart, size_t seqSize, int nbSeq,
+-                         const ZSTD_longOffset_e isLongOffset,
+-                         const int frame)
++                         const ZSTD_longOffset_e isLongOffset)
+ {
+     const BYTE* ip = (const BYTE*)seqStart;
+     const BYTE* const iend = ip + seqSize;
+     BYTE* const ostart = (BYTE*)dst;
+-    BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;
++    BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
+     BYTE* op = ostart;
+     const BYTE* litPtr = dctx->litPtr;
+     const BYTE* litBufferEnd = dctx->litBufferEnd;
+     const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+     const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
+     const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+-    (void)frame;
+ 
+     /* Regen sequences */
+     if (nbSeq) {
+@@ -1706,20 +1771,17 @@ ZSTD_decompressSequencesLong_body(
+         ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+ 
+         /* prepare in advance */
+-        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
+-            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
++        for (seqNb=0; seqNb<seqAdvance; seqNb++) {
++            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1);
+             prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+             sequences[seqNb] = sequence;
+         }
+-        RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
+ 
+         /* decompress without stomping litBuffer */
+-        for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) {
+-            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+-            size_t oneSeqSize;
++        for (; seqNb < nbSeq; seqNb++) {
++            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1);
+ 
+-            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd)
+-            {
++            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) {
+                 /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
+                 const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+                 if (leftoverLit)
+@@ -1732,26 +1794,26 @@ ZSTD_decompressSequencesLong_body(
+                 litPtr = dctx->litExtraBuffer;
+                 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+                 dctx->litBufferLocation = ZSTD_not_in_dst;
+-                oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
++                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+-                assert(!ZSTD_isError(oneSeqSize));
+-                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
++                    assert(!ZSTD_isError(oneSeqSize));
++                    ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+-                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
++                    if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ 
+-                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+-                sequences[seqNb & STORED_SEQS_MASK] = sequence;
+-                op += oneSeqSize;
+-            }
++                    prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
++                    sequences[seqNb & STORED_SEQS_MASK] = sequence;
++                    op += oneSeqSize;
++            }   }
+             else
+             {
+                 /* lit buffer is either wholly contained in first or second split, or not split at all*/
+-                oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
++                size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+                     ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
+                     ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                 assert(!ZSTD_isError(oneSeqSize));
+-                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
++                ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+                 if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ 
+@@ -1760,17 +1822,15 @@ ZSTD_decompressSequencesLong_body(
+                 op += oneSeqSize;
+             }
+         }
+-        RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");
++        RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
+ 
+         /* finish queue */
+         seqNb -= seqAdvance;
+         for ( ; seqNb<nbSeq ; seqNb++) {
+             seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
+-            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)
+-            {
++            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) {
+                 const size_t leftoverLit = dctx->litBufferEnd - litPtr;
+-                if (leftoverLit)
+-                {
++                if (leftoverLit) {
+                     RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
+                     ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
+                     sequence->litLength -= leftoverLit;
+@@ -1779,11 +1839,10 @@ ZSTD_decompressSequencesLong_body(
+                 litPtr = dctx->litExtraBuffer;
+                 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
+                 dctx->litBufferLocation = ZSTD_not_in_dst;
+-                {
+-                    size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
++                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                     assert(!ZSTD_isError(oneSeqSize));
+-                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
++                    ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+                     if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+                     op += oneSeqSize;
+@@ -1796,7 +1855,7 @@ ZSTD_decompressSequencesLong_body(
+                     ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+ #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+                 assert(!ZSTD_isError(oneSeqSize));
+-                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
++                ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+ #endif
+                 if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+                 op += oneSeqSize;
+@@ -1808,8 +1867,7 @@ ZSTD_decompressSequencesLong_body(
+     }
+ 
+     /* last literal segment */
+-    if (dctx->litBufferLocation == ZSTD_split)  /* first deplete literal buffer in dst, then copy litExtraBuffer */
+-    {
++    if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */
+         size_t const lastLLSize = litBufferEnd - litPtr;
+         RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
+         if (op != NULL) {
+@@ -1827,17 +1885,16 @@ ZSTD_decompressSequencesLong_body(
+         }
+     }
+ 
+-    return op-ostart;
++    return (size_t)(op - ostart);
+ }
+ 
+ static size_t
+ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
+                                  void* dst, size_t maxDstSize,
+                            const void* seqStart, size_t seqSize, int nbSeq,
+-                           const ZSTD_longOffset_e isLongOffset,
+-                           const int frame)
++                           const ZSTD_longOffset_e isLongOffset)
+ {
+-    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
++    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ }
+ #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
+ 
+@@ -1851,20 +1908,18 @@ DONT_VECTORIZE
+ ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
+                                  void* dst, size_t maxDstSize,
+                            const void* seqStart, size_t seqSize, int nbSeq,
+-                           const ZSTD_longOffset_e isLongOffset,
+-                           const int frame)
++                           const ZSTD_longOffset_e isLongOffset)
+ {
+-    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
++    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ }
+ static BMI2_TARGET_ATTRIBUTE size_t
+ DONT_VECTORIZE
+ ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
+                                  void* dst, size_t maxDstSize,
+                            const void* seqStart, size_t seqSize, int nbSeq,
+-                           const ZSTD_longOffset_e isLongOffset,
+-                           const int frame)
++                           const ZSTD_longOffset_e isLongOffset)
+ {
+-    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
++    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ }
+ #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
+ 
+@@ -1873,10 +1928,9 @@ static BMI2_TARGET_ATTRIBUTE size_t
+ ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
+                                  void* dst, size_t maxDstSize,
+                            const void* seqStart, size_t seqSize, int nbSeq,
+-                           const ZSTD_longOffset_e isLongOffset,
+-                           const int frame)
++                           const ZSTD_longOffset_e isLongOffset)
+ {
+-    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
++    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ }
+ #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
+ 
+@@ -1886,37 +1940,34 @@ typedef size_t (*ZSTD_decompressSequences_t)(
+                             ZSTD_DCtx* dctx,
+                             void* dst, size_t maxDstSize,
+                             const void* seqStart, size_t seqSize, int nbSeq,
+-                            const ZSTD_longOffset_e isLongOffset,
+-                            const int frame);
++                            const ZSTD_longOffset_e isLongOffset);
+ 
+ #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+ static size_t
+ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+                    const void* seqStart, size_t seqSize, int nbSeq,
+-                   const ZSTD_longOffset_e isLongOffset,
+-                   const int frame)
++                   const ZSTD_longOffset_e isLongOffset)
+ {
+     DEBUGLOG(5, "ZSTD_decompressSequences");
+ #if DYNAMIC_BMI2
+     if (ZSTD_DCtx_get_bmi2(dctx)) {
+-        return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
++        return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+     }
+ #endif
+-    return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
++    return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ }
+ static size_t
+ ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+                                  const void* seqStart, size_t seqSize, int nbSeq,
+-                                 const ZSTD_longOffset_e isLongOffset,
+-                                 const int frame)
++                                 const ZSTD_longOffset_e isLongOffset)
+ {
+     DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
+ #if DYNAMIC_BMI2
+     if (ZSTD_DCtx_get_bmi2(dctx)) {
+-        return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
++        return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+     }
+ #endif
+-    return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
++    return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ }
+ #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
+ 
+@@ -1931,69 +1982,114 @@ static size_t
+ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
+                              void* dst, size_t maxDstSize,
+                              const void* seqStart, size_t seqSize, int nbSeq,
+-                             const ZSTD_longOffset_e isLongOffset,
+-                             const int frame)
++                             const ZSTD_longOffset_e isLongOffset)
+ {
+     DEBUGLOG(5, "ZSTD_decompressSequencesLong");
+ #if DYNAMIC_BMI2
+     if (ZSTD_DCtx_get_bmi2(dctx)) {
+-        return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
++        return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+     }
+ #endif
+-  return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
++  return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
+ }
  #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
  
  
@@ -22777,8 +32419,9 @@ index c1913b8e7c89..9f5577e5bc19 100644
  
  size_t
  ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
-@@ -1980,20 +2055,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
-                         const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
+                               void* dst, size_t dstCapacity,
+-                        const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
++                        const void* src, size_t srcSize, const streaming_operation streaming)
  {   /* blockType == blockCompressed */
      const BYTE* ip = (const BYTE*)src;
 -    /* isLongOffset must be true if there are long offsets.
@@ -22788,18 +32431,20 @@ index c1913b8e7c89..9f5577e5bc19 100644
 -     * (note: but it could be evaluated from current-lowLimit)
 -     */
 -    ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
-     DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
- 
+-    DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
+-
 -    RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
++    DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize);
++
 +    /* Note : the wording of the specification
-+     * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX.
++     * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx).
 +     * This generally does not happen, as it makes little sense,
 +     * since an uncompressed block would feature same size and have no decompression cost.
 +     * Also, note that decoder from reference libzstd before < v1.5.4
 +     * would consider this edge case as an error.
-+     * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX
++     * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx)
 +     * for broader compatibility with the deployed ecosystem of zstd decoders */
-+    RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
++    RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, "");
  
      /* Decode literals section */
      {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
@@ -22808,15 +32453,15 @@ index c1913b8e7c89..9f5577e5bc19 100644
          if (ZSTD_isError(litCSize)) return litCSize;
          ip += litCSize;
          srcSize -= litCSize;
-@@ -2001,6 +2077,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+@@ -2001,6 +2097,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  
      /* Build Decoding Tables */
      {
 +        /* Compute the maximum block size, which must also work when !frame and fParams are unset.
 +         * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
 +         */
-+        size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX));
-+        size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart);
++        size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx));
++        size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart);
 +        /* isLongOffset must be true if there are long offsets.
 +         * Offsets are long if they are larger than ZSTD_maxShortOffset().
 +         * We don't expect that to be the case in 64-bit mode.
@@ -22832,7 +32477,7 @@ index c1913b8e7c89..9f5577e5bc19 100644
          /* These macros control at build-time which decompressor implementation
           * we use. If neither is defined, we do some inspection and dispatch at
           * runtime.
-@@ -2008,6 +2101,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+@@ -2008,6 +2121,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
  #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
      !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
          int usePrefetchDecoder = dctx->ddictIsCold;
@@ -22844,7 +32489,7 @@ index c1913b8e7c89..9f5577e5bc19 100644
  #endif
          int nbSeq;
          size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
-@@ -2015,28 +2113,42 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+@@ -2015,40 +2133,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
          ip += seqHSize;
          srcSize -= seqHSize;
  
@@ -22892,13 +32537,29 @@ index c1913b8e7c89..9f5577e5bc19 100644
 +        {
  #endif
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
-             return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+-            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
++            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
  #endif
 +        }
  
  #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
          /* else */
-@@ -2060,9 +2172,9 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
+         if (dctx->litBufferLocation == ZSTD_split)
+-            return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
++            return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+         else
+-            return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
++            return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+ #endif
+     }
+ }
+ 
+ 
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
+ {
+     if (dst != dctx->previousDstEnd && dstSize > 0) {   /* not contiguous */
+@@ -2060,13 +2193,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
  }
  
  
@@ -22910,8 +32571,11 @@ index c1913b8e7c89..9f5577e5bc19 100644
 +                                 const void* src, size_t srcSize)
  {
      size_t dSize;
++    dctx->isFrameDecompression = 0;
      ZSTD_checkContinuity(dctx, dst, dstCapacity);
-@@ -2070,3 +2182,12 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+-    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);
++    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming);
++    FORWARD_IF_ERROR(dSize, "");
      dctx->previousDstEnd = (char*)dst + dSize;
      return dSize;
  }
@@ -22925,7 +32589,7 @@ index c1913b8e7c89..9f5577e5bc19 100644
 +    return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize);
 +}
 diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h
-index 3d2d57a5d25a..5888e6cc788b 100644
+index 3d2d57a5d25a..becffbd89364 100644
 --- a/lib/zstd/decompress/zstd_decompress_block.h
 +++ b/lib/zstd/decompress/zstd_decompress_block.h
 @@ -1,5 +1,6 @@
@@ -22936,6 +32600,15 @@ index 3d2d57a5d25a..5888e6cc788b 100644
   * All rights reserved.
   *
   * This source code is licensed under both the BSD-style license (found in the
+@@ -47,7 +48,7 @@ typedef enum {
+  */
+ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+                                void* dst, size_t dstCapacity,
+-                         const void* src, size_t srcSize, const int frame, const streaming_operation streaming);
++                         const void* src, size_t srcSize, const streaming_operation streaming);
+ 
+ /* ZSTD_buildFSETable() :
+  * generate FSE decoding table for one symbol (ll, ml or off)
 @@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
                     unsigned tableLog, void* wksp, size_t wkspSize,
                     int bmi2);
@@ -22948,7 +32621,7 @@ index 3d2d57a5d25a..5888e6cc788b 100644
  
  #endif /* ZSTD_DEC_BLOCK_H */
 diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h
-index 98102edb6a83..32f79fb2873d 100644
+index 98102edb6a83..0f02526be774 100644
 --- a/lib/zstd/decompress/zstd_decompress_internal.h
 +++ b/lib/zstd/decompress/zstd_decompress_internal.h
 @@ -1,5 +1,6 @@
@@ -22974,11 +32647,20 @@ index 98102edb6a83..32f79fb2873d 100644
      U32 rep[ZSTD_REP_NUM];
      U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32];
  } ZSTD_entropyDTables_t;
-@@ -164,6 +166,7 @@ struct ZSTD_DCtx_s
+@@ -152,6 +154,7 @@ struct ZSTD_DCtx_s
+     size_t litSize;
+     size_t rleSize;
+     size_t staticSize;
++    int isFrameDecompression;
+ #if DYNAMIC_BMI2 != 0
+     int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+ #endif
+@@ -164,6 +167,8 @@ struct ZSTD_DCtx_s
      ZSTD_dictUses_e dictUses;
      ZSTD_DDictHashSet* ddictSet;                    /* Hash set for multiple ddicts */
      ZSTD_refMultipleDDicts_e refMultipleDDicts;     /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
 +    int disableHufAsm;
++    int maxBlockSizeParam;
  
      /* streaming */
      ZSTD_dStreamStage streamStage;
diff --git a/patches/series b/patches/series
index be1ab52..60ad529 100644
--- a/patches/series
+++ b/patches/series
@@ -1,7 +1,6 @@
 cachyos/0001-cachyos-base-all.patch
 cachyos/0001-bore-cachy.patch
 cachyos/0002-ntsync.patch
-cachyos/0003-nvidia.patch
 cachyos/0004-intel.patch
 nobara/0001-Allow-to-set-custom-USB-pollrate-for-specific-device.patch
 nobara/0001-Revert-PCI-Add-a-REBAR-size-quirk-for-Sapphire-RX-56.patch
@@ -9,7 +8,6 @@ nobara/0001-Revert-nvme-pci-drop-redundant-pci_enable_pcie_error.patch
 nobara/0001-Set-amdgpu.ppfeaturemask-0xffffffff-as-default.patch
 nobara/0001-acpi-proc-idle-skip-dummy-wait.patch
 nobara/0001-add-acpi_call.patch
-nobara/OpenRGB.patch
 nobara/amdgpu-si-cik-default.patch
 asuslinux/0001-platform-x86-asus-wmi-add-support-for-2024-ROG-Mini-.patch
 asuslinux/0002-platform-x86-asus-wmi-add-support-for-Vivobook-GPU-M.patch
diff --git a/scripts/config.sh b/scripts/config.sh
index 562540c..8e6927f 100755
--- a/scripts/config.sh
+++ b/scripts/config.sh
@@ -4,11 +4,12 @@ echo "Pika Kernel - Applying configuration"
 
 cp ../config .config
 
-scripts/config -k -e CONFIG_GENERIC_CPU
+scripts/config -k -d CONFIG_GENERIC_CPU
+scripts/config -k -e CONFIG_GENERIC_CPU2
 scripts/config -e CACHY
 scripts/config -e SCHED_BORE
 
-scripts/config -e HZ_300 --set-val HZ 500
+scripts/config -e HZ_300 --set-val HZ 750
 scripts/config -d HZ_PERIODIC -d NO_HZ_IDLE -d CONTEXT_TRACKING_FORCE -e NO_HZ_FULL_NODEF -e NO_HZ_FULL -e NO_HZ -e NO_HZ_COMMON -e CONTEXT_TRACKING
 scripts/config -e PREEMPT_BUILD -d PREEMPT_NONE -d PREEMPT_VOLUNTARY -e PREEMPT -e PREEMPT_COUNT -e PREEMPTION -e PREEMPT_DYNAMIC