From e928143484ea185cb948dc2854ed9655e6096735 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sun, 8 Sep 2024 10:19:12 +0200
Subject: [PATCH 01/12] amd-pstate

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 Documentation/admin-guide/pm/amd-pstate.rst |  18 +-
 arch/x86/include/asm/cpufeatures.h          |   1 +
 arch/x86/include/asm/msr-index.h            |   2 +
 arch/x86/kernel/cpu/scattered.c             |   1 +
 drivers/cpufreq/Kconfig.x86                 |   1 +
 drivers/cpufreq/acpi-cpufreq.c              |   3 +-
 drivers/cpufreq/amd-pstate.c                | 326 +++++++++++++++-----
 drivers/cpufreq/amd-pstate.h                |   1 +
 drivers/cpufreq/cpufreq.c                   |  11 +-
 9 files changed, 279 insertions(+), 85 deletions(-)

diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst
index 1e0d101b020a..d0324d44f548 100644
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -281,6 +281,22 @@ integer values defined between 0 to 255 when EPP feature is enabled by platform
 firmware, if EPP feature is disabled, driver will ignore the written value
 This attribute is read-write.
 
+``boost``
+The `boost` sysfs attribute provides control over the CPU core
+performance boost, allowing users to manage the maximum frequency limitation
+of the CPU. This attribute can be used to enable or disable the boost feature
+on individual CPUs.
+
+When the boost feature is enabled, the CPU can dynamically increase its frequency
+beyond the base frequency, providing enhanced performance for demanding workloads.
+On the other hand, disabling the boost feature restricts the CPU to operate at the
+base frequency, which may be desirable in certain scenarios to prioritize power
+efficiency or manage temperature.
+
+To manipulate the `boost` attribute, users can write a value of `0` to disable the
+boost or `1` to enable it, for the respective CPU using the sysfs path
+`/sys/devices/system/cpu/cpuX/cpufreq/boost`, where `X` represents the CPU number.
+
 Other performance and frequency values can be read back from
 ``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`.
 
@@ -406,7 +422,7 @@ control its functionality at the system level.  They are located in the
 ``/sys/devices/system/cpu/amd_pstate/`` directory and affect all CPUs.
 
 ``status``
-	Operation mode of the driver: "active", "passive" or "disable".
+	Operation mode of the driver: "active", "passive", "guided" or "disable".
 
 	"active"
 		The driver is functional and in the ``active mode``
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 3c7434329661..6c128d463a14 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -470,6 +470,7 @@
 #define X86_FEATURE_BHI_CTRL		(21*32+ 2) /* "" BHI_DIS_S HW control available */
 #define X86_FEATURE_CLEAR_BHB_HW	(21*32+ 3) /* "" BHI_DIS_S HW control enabled */
 #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */
+#define X86_FEATURE_FAST_CPPC		(21*32 + 5) /* "" AMD Fast CPPC */
 
 /*
  * BUG word(s)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e022e6eb766c..384739d592af 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -781,6 +781,8 @@
 #define MSR_K7_HWCR_IRPERF_EN		BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT)
 #define MSR_K7_FID_VID_CTL		0xc0010041
 #define MSR_K7_FID_VID_STATUS		0xc0010042
+#define MSR_K7_HWCR_CPB_DIS_BIT		25
+#define MSR_K7_HWCR_CPB_DIS		BIT_ULL(MSR_K7_HWCR_CPB_DIS_BIT)
 
 /* K6 MSRs */
 #define MSR_K6_WHCR			0xc0000082
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index af5aa2c754c2..c84c30188fdf 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -45,6 +45,7 @@ static const struct cpuid_bit cpuid_bits[] = {
 	{ X86_FEATURE_HW_PSTATE,	CPUID_EDX,  7, 0x80000007, 0 },
 	{ X86_FEATURE_CPB,		CPUID_EDX,  9, 0x80000007, 0 },
 	{ X86_FEATURE_PROC_FEEDBACK,    CPUID_EDX, 11, 0x80000007, 0 },
+	{ X86_FEATURE_FAST_CPPC, 	CPUID_EDX, 15, 0x80000007, 0 },
 	{ X86_FEATURE_MBA,		CPUID_EBX,  6, 0x80000008, 0 },
 	{ X86_FEATURE_SMBA,		CPUID_EBX,  2, 0x80000020, 0 },
 	{ X86_FEATURE_BMEC,		CPUID_EBX,  3, 0x80000020, 0 },
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index 438c9e75a04d..97c2d4f15d76 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -71,6 +71,7 @@ config X86_AMD_PSTATE_DEFAULT_MODE
 config X86_AMD_PSTATE_UT
 	tristate "selftest for AMD Processor P-State driver"
 	depends on X86 && ACPI_PROCESSOR
+	depends on X86_AMD_PSTATE
 	default n
 	help
 	  This kernel module is used for testing. It's safe to say M here.
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 4ac3a35dcd98..f4f8587c4ea0 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -50,8 +50,6 @@ enum {
 #define AMD_MSR_RANGE		(0x7)
 #define HYGON_MSR_RANGE		(0x7)
 
-#define MSR_K7_HWCR_CPB_DIS	(1ULL << 25)
-
 struct acpi_cpufreq_data {
 	unsigned int resume;
 	unsigned int cpu_feature;
@@ -139,6 +137,7 @@ static int set_boost(struct cpufreq_policy *policy, int val)
 			 (void *)(long)val, 1);
 	pr_debug("CPU %*pbl: Core Boosting %s.\n",
 		 cpumask_pr_args(policy->cpus), str_enabled_disabled(val));
+	policy->boost_enabled = val;
 
 	return 0;
 }
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 6f59403c91e0..bcf2f5a63376 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -51,6 +51,7 @@
 
 #define AMD_PSTATE_TRANSITION_LATENCY	20000
 #define AMD_PSTATE_TRANSITION_DELAY	1000
+#define AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY 600
 #define CPPC_HIGHEST_PERF_PERFORMANCE	196
 #define CPPC_HIGHEST_PERF_DEFAULT	166
 
@@ -85,15 +86,6 @@ struct quirk_entry {
 	u32 lowest_freq;
 };
 
-/*
- * TODO: We need more time to fine tune processors with shared memory solution
- * with community together.
- *
- * There are some performance drops on the CPU benchmarks which reports from
- * Suse. We are co-working with them to fine tune the shared memory solution. So
- * we disable it by default to go acpi-cpufreq on these processors and add a
- * module parameter to be able to enable it manually for debugging.
- */
 static struct cpufreq_driver *current_pstate_driver;
 static struct cpufreq_driver amd_pstate_driver;
 static struct cpufreq_driver amd_pstate_epp_driver;
@@ -150,6 +142,11 @@ static struct quirk_entry quirk_amd_7k62 = {
 	.lowest_freq = 550,
 };
 
+static struct quirk_entry quirk_amd_mts = {
+	.nominal_freq = 3600,
+	.lowest_freq = 550,
+};
+
 static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi)
 {
 	/**
@@ -157,7 +154,7 @@ static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi)
 	 * broken BIOS lack of nominal_freq and lowest_freq capabilities
 	 * definition in ACPI tables
 	 */
-	if (boot_cpu_has(X86_FEATURE_ZEN2)) {
+	if (cpu_feature_enabled(X86_FEATURE_ZEN2)) {
 		quirks = dmi->driver_data;
 		pr_info("Overriding nominal and lowest frequencies for %s\n", dmi->ident);
 		return 1;
@@ -166,6 +163,21 @@ static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi)
 	return 0;
 }
 
+static int __init dmi_matched_mts_bios_bug(const struct dmi_system_id *dmi)
+{
+	/**
+	 * match the broken bios for ryzen 3000 series processor support CPPC V2
+	 * broken BIOS lack of nominal_freq and lowest_freq capabilities
+	 * definition in ACPI tables
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_ZEN2)) {
+		quirks = dmi->driver_data;
+		pr_info("Overriding nominal and lowest frequencies for %s\n", dmi->ident);
+		return 1;
+	}
+
+	return 0;
+}
 static const struct dmi_system_id amd_pstate_quirks_table[] __initconst = {
 	{
 		.callback = dmi_matched_7k62_bios_bug,
@@ -176,6 +188,16 @@ static const struct dmi_system_id amd_pstate_quirks_table[] __initconst = {
 		},
 		.driver_data = &quirk_amd_7k62,
 	},
+	{
+		.callback = dmi_matched_mts_bios_bug,
+		.ident = "AMD Ryzen 3000",
+		.matches = {
+			DMI_MATCH(DMI_PRODUCT_NAME, "B450M MORTAR MAX (MS-7B89)"),
+			DMI_MATCH(DMI_BIOS_RELEASE, "06/10/2020"),
+			DMI_MATCH(DMI_BIOS_VERSION, "5.14"),
+		},
+		.driver_data = &quirk_amd_mts,
+	},
 	{}
 };
 MODULE_DEVICE_TABLE(dmi, amd_pstate_quirks_table);
@@ -199,7 +221,7 @@ static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached)
 	u64 epp;
 	int ret;
 
-	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+	if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
 		if (!cppc_req_cached) {
 			epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ,
 					&cppc_req_cached);
@@ -272,7 +294,7 @@ static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp)
 	int ret;
 	struct cppc_perf_ctrls perf_ctrls;
 
-	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+	if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
 		u64 value = READ_ONCE(cpudata->cppc_req_cached);
 
 		value &= ~GENMASK_ULL(31, 24);
@@ -522,7 +544,10 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata)
 static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
 			      u32 des_perf, u32 max_perf, bool fast_switch, int gov_flags)
 {
+	unsigned long max_freq;
+	struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu);
 	u64 prev = READ_ONCE(cpudata->cppc_req_cached);
+	u32 nominal_perf = READ_ONCE(cpudata->nominal_perf);
 	u64 value = prev;
 
 	min_perf = clamp_t(unsigned long, min_perf, cpudata->min_limit_perf,
@@ -531,6 +556,9 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
 			cpudata->max_limit_perf);
 	des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf);
 
+	max_freq = READ_ONCE(cpudata->max_limit_freq);
+	policy->cur = div_u64(des_perf * max_freq, max_perf);
+
 	if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) {
 		min_perf = des_perf;
 		des_perf = 0;
@@ -542,6 +570,10 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
 	value &= ~AMD_CPPC_DES_PERF(~0L);
 	value |= AMD_CPPC_DES_PERF(des_perf);
 
+	/* limit the max perf when core performance boost feature is disabled */
+	if (!cpudata->boost_supported)
+		max_perf = min_t(unsigned long, nominal_perf, max_perf);
+
 	value &= ~AMD_CPPC_MAX_PERF(~0L);
 	value |= AMD_CPPC_MAX_PERF(max_perf);
 
@@ -552,12 +584,15 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
 	}
 
 	if (value == prev)
-		return;
+		goto cpufreq_policy_put;
 
 	WRITE_ONCE(cpudata->cppc_req_cached, value);
 
 	amd_pstate_update_perf(cpudata, min_perf, des_perf,
 			       max_perf, fast_switch);
+
+cpufreq_policy_put:
+	cpufreq_cpu_put(policy);
 }
 
 static int amd_pstate_verify(struct cpufreq_policy_data *policy)
@@ -652,10 +687,9 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
 				   unsigned long capacity)
 {
 	unsigned long max_perf, min_perf, des_perf,
-		      cap_perf, lowest_nonlinear_perf, max_freq;
+		      cap_perf, lowest_nonlinear_perf;
 	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
 	struct amd_cpudata *cpudata = policy->driver_data;
-	unsigned int target_freq;
 
 	if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
 		amd_pstate_update_min_max_limit(policy);
@@ -663,7 +697,6 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
 
 	cap_perf = READ_ONCE(cpudata->highest_perf);
 	lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf);
-	max_freq = READ_ONCE(cpudata->max_freq);
 
 	des_perf = cap_perf;
 	if (target_perf < capacity)
@@ -681,51 +714,111 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
 		max_perf = min_perf;
 
 	des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf);
-	target_freq = div_u64(des_perf * max_freq, max_perf);
-	policy->cur = target_freq;
 
 	amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true,
 			policy->governor->flags);
 	cpufreq_cpu_put(policy);
 }
 
-static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state)
+static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on)
 {
 	struct amd_cpudata *cpudata = policy->driver_data;
-	int ret;
+	struct cppc_perf_ctrls perf_ctrls;
+	u32 highest_perf, nominal_perf, nominal_freq, max_freq;
+	int ret = 0;
 
-	if (!cpudata->boost_supported) {
-		pr_err("Boost mode is not supported by this processor or SBIOS\n");
-		return -EINVAL;
+	highest_perf = READ_ONCE(cpudata->highest_perf);
+	nominal_perf = READ_ONCE(cpudata->nominal_perf);
+	nominal_freq = READ_ONCE(cpudata->nominal_freq);
+	max_freq = READ_ONCE(cpudata->max_freq);
+
+	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+		u64 value = READ_ONCE(cpudata->cppc_req_cached);
+
+		value &= ~GENMASK_ULL(7, 0);
+		value |= on ? highest_perf : nominal_perf;
+		WRITE_ONCE(cpudata->cppc_req_cached, value);
+
+		wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
+	} else {
+		perf_ctrls.max_perf = on ? highest_perf : nominal_perf;
+		ret = cppc_set_perf(cpudata->cpu, &perf_ctrls);
+		if (ret) {
+			cpufreq_cpu_release(policy);
+			pr_debug("Failed to set max perf on CPU:%d. ret:%d\n",
+				cpudata->cpu, ret);
+			return ret;
+		}
 	}
 
-	if (state)
-		policy->cpuinfo.max_freq = cpudata->max_freq;
-	else
-		policy->cpuinfo.max_freq = cpudata->nominal_freq * 1000;
+	if (on)
+		policy->cpuinfo.max_freq = max_freq;
+	else if (policy->cpuinfo.max_freq > nominal_freq * 1000)
+		policy->cpuinfo.max_freq = nominal_freq * 1000;
 
 	policy->max = policy->cpuinfo.max_freq;
 
-	ret = freq_qos_update_request(&cpudata->req[1],
-				      policy->cpuinfo.max_freq);
-	if (ret < 0)
-		return ret;
+	if (cppc_state == AMD_PSTATE_PASSIVE) {
+		ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq);
+		if (ret < 0)
+			pr_debug("Failed to update freq constraint: CPU%d\n", cpudata->cpu);
+	}
 
-	return 0;
+	return ret < 0 ? ret : 0;
 }
 
-static void amd_pstate_boost_init(struct amd_cpudata *cpudata)
+static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state)
 {
-	u32 highest_perf, nominal_perf;
+	struct amd_cpudata *cpudata = policy->driver_data;
+	int ret;
 
-	highest_perf = READ_ONCE(cpudata->highest_perf);
-	nominal_perf = READ_ONCE(cpudata->nominal_perf);
+	if (!cpudata->boost_supported) {
+		pr_err("Boost mode is not supported by this processor or SBIOS\n");
+		return -EOPNOTSUPP;
+	}
+	mutex_lock(&amd_pstate_driver_lock);
+	ret = amd_pstate_cpu_boost_update(policy, state);
+	WRITE_ONCE(cpudata->boost_state, !ret ? state : false);
+	policy->boost_enabled = !ret ? state : false;
+	refresh_frequency_limits(policy);
+	mutex_unlock(&amd_pstate_driver_lock);
 
-	if (highest_perf <= nominal_perf)
-		return;
+	return ret;
+}
+
+static int amd_pstate_init_boost_support(struct amd_cpudata *cpudata)
+{
+	u64 boost_val;
+	int ret = -1;
 
-	cpudata->boost_supported = true;
+	/*
+	 * If platform has no CPB support or disable it, initialize current driver
+	 * boost_enabled state to be false, it is not an error for cpufreq core to handle.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_CPB)) {
+		pr_debug_once("Boost CPB capabilities not present in the processor\n");
+		ret = 0;
+		goto exit_err;
+	}
+
+	/* at least one CPU supports CPB, even if others fail later on to set up */
 	current_pstate_driver->boost_enabled = true;
+
+	ret = rdmsrl_on_cpu(cpudata->cpu, MSR_K7_HWCR, &boost_val);
+	if (ret) {
+		pr_err_once("failed to read initial CPU boost state!\n");
+		ret = -EIO;
+		goto exit_err;
+	}
+
+	if (!(boost_val & MSR_K7_HWCR_CPB_DIS))
+		cpudata->boost_supported = true;
+
+	return 0;
+
+exit_err:
+	cpudata->boost_supported = false;
+	return ret;
 }
 
 static void amd_perf_ctl_reset(unsigned int cpu)
@@ -754,7 +847,7 @@ static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf)
 {
 	int ret;
 
-	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+	if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
 		u64 cap1;
 
 		ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1);
@@ -850,8 +943,12 @@ static u32 amd_pstate_get_transition_delay_us(unsigned int cpu)
 	u32 transition_delay_ns;
 
 	transition_delay_ns = cppc_get_transition_latency(cpu);
-	if (transition_delay_ns == CPUFREQ_ETERNAL)
-		return AMD_PSTATE_TRANSITION_DELAY;
+	if (transition_delay_ns == CPUFREQ_ETERNAL) {
+		if (cpu_feature_enabled(X86_FEATURE_FAST_CPPC))
+			return AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY;
+		else
+			return AMD_PSTATE_TRANSITION_DELAY;
+	}
 
 	return transition_delay_ns / NSEC_PER_USEC;
 }
@@ -922,12 +1019,30 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata)
 	WRITE_ONCE(cpudata->nominal_freq, nominal_freq);
 	WRITE_ONCE(cpudata->max_freq, max_freq);
 
+	/**
+	 * Below values need to be initialized correctly, otherwise driver will fail to load
+	 * max_freq is calculated according to (nominal_freq * highest_perf)/nominal_perf
+	 * lowest_nonlinear_freq is a value between [min_freq, nominal_freq]
+	 * Check _CPC in ACPI table objects if any values are incorrect
+	 */
+	if (min_freq <= 0 || max_freq <= 0 || nominal_freq <= 0 || min_freq > max_freq) {
+		pr_err("min_freq(%d) or max_freq(%d) or nominal_freq(%d) value is incorrect\n",
+			min_freq, max_freq, nominal_freq * 1000);
+		return -EINVAL;
+	}
+
+	if (lowest_nonlinear_freq <= min_freq || lowest_nonlinear_freq > nominal_freq * 1000) {
+		pr_err("lowest_nonlinear_freq(%d) value is out of range [min_freq(%d), nominal_freq(%d)]\n",
+			lowest_nonlinear_freq, min_freq, nominal_freq * 1000);
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
 static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
 {
-	int min_freq, max_freq, nominal_freq, ret;
+	int min_freq, max_freq, ret;
 	struct device *dev;
 	struct amd_cpudata *cpudata;
 
@@ -956,18 +1071,12 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
 	if (ret)
 		goto free_cpudata1;
 
+	ret = amd_pstate_init_boost_support(cpudata);
+	if (ret)
+		goto free_cpudata1;
+
 	min_freq = READ_ONCE(cpudata->min_freq);
 	max_freq = READ_ONCE(cpudata->max_freq);
-	nominal_freq = READ_ONCE(cpudata->nominal_freq);
-
-	if (min_freq <= 0 || max_freq <= 0 ||
-	    nominal_freq <= 0 || min_freq > max_freq) {
-		dev_err(dev,
-			"min_freq(%d) or max_freq(%d) or nominal_freq (%d) value is incorrect, check _CPC in ACPI tables\n",
-			min_freq, max_freq, nominal_freq);
-		ret = -EINVAL;
-		goto free_cpudata1;
-	}
 
 	policy->cpuinfo.transition_latency = amd_pstate_get_transition_latency(policy->cpu);
 	policy->transition_delay_us = amd_pstate_get_transition_delay_us(policy->cpu);
@@ -978,10 +1087,12 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
 	policy->cpuinfo.min_freq = min_freq;
 	policy->cpuinfo.max_freq = max_freq;
 
+	policy->boost_enabled = READ_ONCE(cpudata->boost_supported);
+
 	/* It will be updated by governor */
 	policy->cur = policy->cpuinfo.min_freq;
 
-	if (boot_cpu_has(X86_FEATURE_CPPC))
+	if (cpu_feature_enabled(X86_FEATURE_CPPC))
 		policy->fast_switch_possible = true;
 
 	ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0],
@@ -1003,7 +1114,6 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
 
 	policy->driver_data = cpudata;
 
-	amd_pstate_boost_init(cpudata);
 	if (!current_pstate_driver->adjust_perf)
 		current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
 
@@ -1214,7 +1324,7 @@ static int amd_pstate_change_mode_without_dvr_change(int mode)
 
 	cppc_state = mode;
 
-	if (boot_cpu_has(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE)
+	if (cpu_feature_enabled(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE)
 		return 0;
 
 	for_each_present_cpu(cpu) {
@@ -1387,7 +1497,7 @@ static bool amd_pstate_acpi_pm_profile_undefined(void)
 
 static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
 {
-	int min_freq, max_freq, nominal_freq, ret;
+	int min_freq, max_freq, ret;
 	struct amd_cpudata *cpudata;
 	struct device *dev;
 	u64 value;
@@ -1418,17 +1528,12 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
 	if (ret)
 		goto free_cpudata1;
 
+	ret = amd_pstate_init_boost_support(cpudata);
+	if (ret)
+		goto free_cpudata1;
+
 	min_freq = READ_ONCE(cpudata->min_freq);
 	max_freq = READ_ONCE(cpudata->max_freq);
-	nominal_freq = READ_ONCE(cpudata->nominal_freq);
-	if (min_freq <= 0 || max_freq <= 0 ||
-	    nominal_freq <= 0 || min_freq > max_freq) {
-		dev_err(dev,
-			"min_freq(%d) or max_freq(%d) or nominal_freq(%d) value is incorrect, check _CPC in ACPI tables\n",
-			min_freq, max_freq, nominal_freq);
-		ret = -EINVAL;
-		goto free_cpudata1;
-	}
 
 	policy->cpuinfo.min_freq = min_freq;
 	policy->cpuinfo.max_freq = max_freq;
@@ -1442,6 +1547,8 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
 	policy->min = policy->cpuinfo.min_freq;
 	policy->max = policy->cpuinfo.max_freq;
 
+	policy->boost_enabled = READ_ONCE(cpudata->boost_supported);
+
 	/*
 	 * Set the policy to provide a valid fallback value in case
 	 * the default cpufreq governor is neither powersave nor performance.
@@ -1452,7 +1559,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
 	else
 		policy->policy = CPUFREQ_POLICY_POWERSAVE;
 
-	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+	if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
 		ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value);
 		if (ret)
 			return ret;
@@ -1463,7 +1570,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
 			return ret;
 		WRITE_ONCE(cpudata->cppc_cap1_cached, value);
 	}
-	amd_pstate_boost_init(cpudata);
 
 	return 0;
 
@@ -1542,7 +1648,7 @@ static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
 		epp = 0;
 
 	/* Set initial EPP value */
-	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+	if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
 		value &= ~GENMASK_ULL(31, 24);
 		value |= (u64)epp << 24;
 	}
@@ -1565,6 +1671,12 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
 
 	amd_pstate_epp_update_limit(policy);
 
+	/*
+	 * policy->cur is never updated with the amd_pstate_epp driver, but it
+	 * is used as a stale frequency value. So, keep it within limits.
+	 */
+	policy->cur = policy->min;
+
 	return 0;
 }
 
@@ -1581,7 +1693,7 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata)
 	value = READ_ONCE(cpudata->cppc_req_cached);
 	max_perf = READ_ONCE(cpudata->highest_perf);
 
-	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+	if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
 		wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value);
 	} else {
 		perf_ctrls.max_perf = max_perf;
@@ -1615,7 +1727,7 @@ static void amd_pstate_epp_offline(struct cpufreq_policy *policy)
 	value = READ_ONCE(cpudata->cppc_req_cached);
 
 	mutex_lock(&amd_pstate_limits_lock);
-	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+	if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
 		cpudata->epp_policy = CPUFREQ_POLICY_UNKNOWN;
 
 		/* Set max perf same as min perf */
@@ -1719,6 +1831,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = {
 	.suspend	= amd_pstate_epp_suspend,
 	.resume		= amd_pstate_epp_resume,
 	.update_limits	= amd_pstate_update_limits,
+	.set_boost	= amd_pstate_set_boost,
 	.name		= "amd-pstate-epp",
 	.attr		= amd_pstate_epp_attr,
 };
@@ -1742,6 +1855,60 @@ static int __init amd_pstate_set_driver(int mode_idx)
 	return -EINVAL;
 }
 
+/**
+ * CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F.
+ * show the debug message that helps to check if the CPU has CPPC support for loading issue.
+ */
+static bool amd_cppc_supported(void)
+{
+	struct cpuinfo_x86 *c = &cpu_data(0);
+	bool warn = false;
+
+	if ((boot_cpu_data.x86 == 0x17) && (boot_cpu_data.x86_model < 0x30)) {
+		pr_debug_once("CPPC feature is not supported by the processor\n");
+		return false;
+	}
+
+	/*
+	 * If the CPPC feature is disabled in the BIOS for processors
+	 * that support MSR-based CPPC, the AMD Pstate driver may not
+	 * function correctly.
+	 *
+	 * For such processors, check the CPPC flag and display a
+	 * warning message if the platform supports CPPC.
+	 *
+	 * Note: The code check below will not abort the driver
+	 * registration process because of the code is added for
+	 * debugging purposes. Besides, it may still be possible for
+	 * the driver to work using the shared-memory mechanism.
+	 */
+	if (!cpu_feature_enabled(X86_FEATURE_CPPC)) {
+		if (cpu_feature_enabled(X86_FEATURE_ZEN2)) {
+			switch (c->x86_model) {
+			case 0x60 ... 0x6F:
+			case 0x80 ... 0xAF:
+				warn = true;
+				break;
+			}
+		} else if (cpu_feature_enabled(X86_FEATURE_ZEN3) ||
+			   cpu_feature_enabled(X86_FEATURE_ZEN4)) {
+			switch (c->x86_model) {
+			case 0x10 ... 0x1F:
+			case 0x40 ... 0xAF:
+				warn = true;
+				break;
+			}
+		} else if (cpu_feature_enabled(X86_FEATURE_ZEN5)) {
+			warn = true;
+		}
+	}
+
+	if (warn)
+		pr_warn_once("The CPPC feature is supported but currently disabled by the BIOS.\n"
+					"Please enable it if your BIOS has the CPPC option.\n");
+	return true;
+}
+
 static int __init amd_pstate_init(void)
 {
 	struct device *dev_root;
@@ -1750,6 +1917,11 @@ static int __init amd_pstate_init(void)
 	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
 		return -ENODEV;
 
+	/* show debug message only if CPPC is not supported */
+	if (!amd_cppc_supported())
+		return -EOPNOTSUPP;
+
+	/* show warning message when BIOS broken or ACPI disabled */
 	if (!acpi_cpc_valid()) {
 		pr_warn_once("the _CPC object is not present in SBIOS or ACPI disabled\n");
 		return -ENODEV;
@@ -1774,11 +1946,9 @@ static int __init amd_pstate_init(void)
 		/* Disable on the following configs by default:
 		 * 1. Undefined platforms
 		 * 2. Server platforms
-		 * 3. Shared memory designs
 		 */
 		if (amd_pstate_acpi_pm_profile_undefined() ||
-		    amd_pstate_acpi_pm_profile_server() ||
-		    !boot_cpu_has(X86_FEATURE_CPPC)) {
+		    amd_pstate_acpi_pm_profile_server()) {
 			pr_info("driver load is disabled, boot with specific mode to enable this\n");
 			return -ENODEV;
 		}
@@ -1802,7 +1972,7 @@ static int __init amd_pstate_init(void)
 	}
 
 	/* capability check */
-	if (boot_cpu_has(X86_FEATURE_CPPC)) {
+	if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
 		pr_debug("AMD CPPC MSR based functionality is supported\n");
 		if (cppc_state != AMD_PSTATE_ACTIVE)
 			current_pstate_driver->adjust_perf = amd_pstate_adjust_perf;
@@ -1821,8 +1991,10 @@ static int __init amd_pstate_init(void)
 	}
 
 	ret = cpufreq_register_driver(current_pstate_driver);
-	if (ret)
+	if (ret) {
 		pr_err("failed to register with return %d\n", ret);
+		goto disable_driver;
+	}
 
 	dev_root = bus_get_dev_root(&cpu_subsys);
 	if (dev_root) {
@@ -1838,6 +2010,8 @@ static int __init amd_pstate_init(void)
 
 global_attr_free:
 	cpufreq_unregister_driver(current_pstate_driver);
+disable_driver:
+	amd_pstate_enable(false);
 	return ret;
 }
 device_initcall(amd_pstate_init);
diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h
index f80b33fa5d43..cc8bb2bc325a 100644
--- a/drivers/cpufreq/amd-pstate.h
+++ b/drivers/cpufreq/amd-pstate.h
@@ -100,6 +100,7 @@ struct amd_cpudata {
 	u64	cppc_cap1_cached;
 	bool	suspended;
 	s16	epp_default;
+	bool	boost_state;
 };
 
 #endif /* _LINUX_AMD_PSTATE_H */
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 9e5060b27864..270ea04fb616 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -614,10 +614,9 @@ static ssize_t show_boost(struct kobject *kobj,
 static ssize_t store_boost(struct kobject *kobj, struct kobj_attribute *attr,
 			   const char *buf, size_t count)
 {
-	int ret, enable;
+	bool enable;
 
-	ret = sscanf(buf, "%d", &enable);
-	if (ret != 1 || enable < 0 || enable > 1)
+	if (kstrtobool(buf, &enable))
 		return -EINVAL;
 
 	if (cpufreq_boost_trigger_state(enable)) {
@@ -641,10 +640,10 @@ static ssize_t show_local_boost(struct cpufreq_policy *policy, char *buf)
 static ssize_t store_local_boost(struct cpufreq_policy *policy,
 				 const char *buf, size_t count)
 {
-	int ret, enable;
+	int ret;
+	bool enable;
 
-	ret = kstrtoint(buf, 10, &enable);
-	if (ret || enable < 0 || enable > 1)
+	if (kstrtobool(buf, &enable))
 		return -EINVAL;
 
 	if (!cpufreq_driver->boost_enabled)
-- 
2.46.0

From 44c9ce84f6eb7a7f146f67d9a6716ed779c888ce Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sun, 8 Sep 2024 10:19:24 +0200
Subject: [PATCH 02/12] bbr3

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 include/linux/tcp.h                |    4 +-
 include/net/inet_connection_sock.h |    4 +-
 include/net/tcp.h                  |   72 +-
 include/uapi/linux/inet_diag.h     |   23 +
 include/uapi/linux/rtnetlink.h     |    4 +-
 include/uapi/linux/tcp.h           |    1 +
 net/ipv4/Kconfig                   |   21 +-
 net/ipv4/bpf_tcp_ca.c              |    9 +-
 net/ipv4/tcp.c                     |    3 +
 net/ipv4/tcp_bbr.c                 | 2230 +++++++++++++++++++++-------
 net/ipv4/tcp_cong.c                |    1 +
 net/ipv4/tcp_input.c               |   40 +-
 net/ipv4/tcp_minisocks.c           |    2 +
 net/ipv4/tcp_output.c              |   48 +-
 net/ipv4/tcp_rate.c                |   30 +-
 net/ipv4/tcp_timer.c               |    1 +
 16 files changed, 1940 insertions(+), 553 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 6a5e08b937b3..27aab715490e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -369,7 +369,9 @@ struct tcp_sock {
 	u8	compressed_ack;
 	u8	dup_ack_counter:2,
 		tlp_retrans:1,	/* TLP is a retransmission */
-		unused:5;
+		fast_ack_mode:2, /* which fast ack mode ? */
+		tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? */
+		unused:2;
 	u8	thin_lto    : 1,/* Use linear timeouts for thin streams */
 		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
 		fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index c0deaafebfdc..d53f042d936e 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -137,8 +137,8 @@ struct inet_connection_sock {
 	u32			  icsk_probes_tstamp;
 	u32			  icsk_user_timeout;
 
-	u64			  icsk_ca_priv[104 / sizeof(u64)];
-#define ICSK_CA_PRIV_SIZE	  sizeof_field(struct inet_connection_sock, icsk_ca_priv)
+#define ICSK_CA_PRIV_SIZE      (144)
+	u64			  icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)];
 };
 
 #define ICSK_TIME_RETRANS	1	/* Retransmit timer */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 45bbb54e42e8..bc10ec3e0c55 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -375,6 +375,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk)
 #define	TCP_ECN_QUEUE_CWR	2
 #define	TCP_ECN_DEMAND_CWR	4
 #define	TCP_ECN_SEEN		8
+#define	TCP_ECN_LOW		16
+#define	TCP_ECN_ECT_PERMANENT	32
 
 enum tcp_tw_status {
 	TCP_TW_SUCCESS = 0,
@@ -779,6 +781,15 @@ static inline void tcp_fast_path_check(struct sock *sk)
 
 u32 tcp_delack_max(const struct sock *sk);
 
+static inline void tcp_set_ecn_low_from_dst(struct sock *sk,
+					    const struct dst_entry *dst)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (dst_feature(dst, RTAX_FEATURE_ECN_LOW))
+		tp->ecn_flags |= TCP_ECN_LOW;
+}
+
 /* Compute the actual rto_min value */
 static inline u32 tcp_rto_min(const struct sock *sk)
 {
@@ -884,6 +895,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
 	return max_t(s64, t1 - t0, 0);
 }
 
+static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0)
+{
+	return max_t(s32, t1 - t0, 0);
+}
+
 /* provide the departure time in us unit */
 static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
 {
@@ -973,9 +989,14 @@ struct tcp_skb_cb {
 			/* pkts S/ACKed so far upon tx of skb, incl retrans: */
 			__u32 delivered;
 			/* start of send pipeline phase */
-			u64 first_tx_mstamp;
+			u32 first_tx_mstamp;
 			/* when we reached the "delivered" count */
-			u64 delivered_mstamp;
+			u32 delivered_mstamp;
+#define TCPCB_IN_FLIGHT_BITS 20
+#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1)
+			u32 in_flight:20,   /* packets in flight at transmit */
+			    unused2:12;
+			u32 lost;	/* packets lost so far upon tx of skb */
 		} tx;   /* only used for outgoing skbs */
 		union {
 			struct inet_skb_parm	h4;
@@ -1079,6 +1100,7 @@ enum tcp_ca_event {
 	CA_EVENT_LOSS,		/* loss timeout */
 	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
 	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
+	CA_EVENT_TLP_RECOVERY,	/* a lost segment was repaired by TLP probe */
 };
 
 /* Information about inbound ACK, passed to cong_ops->in_ack_event() */
@@ -1101,7 +1123,11 @@ enum tcp_ca_ack_event_flags {
 #define TCP_CONG_NON_RESTRICTED 0x1
 /* Requires ECN/ECT set on all packets */
 #define TCP_CONG_NEEDS_ECN	0x2
-#define TCP_CONG_MASK	(TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
+/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */
+#define TCP_CONG_WANTS_CE_EVENTS	0x4
+#define TCP_CONG_MASK	(TCP_CONG_NON_RESTRICTED | \
+			 TCP_CONG_NEEDS_ECN | \
+			 TCP_CONG_WANTS_CE_EVENTS)
 
 union tcp_cc_info;
 
@@ -1121,10 +1147,13 @@ struct ack_sample {
  */
 struct rate_sample {
 	u64  prior_mstamp; /* starting timestamp for interval */
+	u32  prior_lost;	/* tp->lost at "prior_mstamp" */
 	u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
 	u32  prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */
+	u32 tx_in_flight;	/* packets in flight at starting timestamp */
+	s32  lost;		/* number of packets lost over interval */
 	s32  delivered;		/* number of packets delivered over interval */
-	s32  delivered_ce;	/* number of packets delivered w/ CE marks*/
+	s32  delivered_ce;	/* packets delivered w/ CE mark over interval */
 	long interval_us;	/* time for tp->delivered to incr "delivered" */
 	u32 snd_interval_us;	/* snd interval for delivered packets */
 	u32 rcv_interval_us;	/* rcv interval for delivered packets */
@@ -1135,7 +1164,9 @@ struct rate_sample {
 	u32  last_end_seq;	/* end_seq of most recently ACKed packet */
 	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
 	bool is_retrans;	/* is sample from retransmission? */
+	bool is_acking_tlp_retrans_seq;  /* ACKed a TLP retransmit sequence? */
 	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
+	bool is_ece;		/* did this ACK have ECN marked? */
 };
 
 struct tcp_congestion_ops {
@@ -1159,8 +1190,11 @@ struct tcp_congestion_ops {
 	/* hook for packet ack accounting (optional) */
 	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
 
-	/* override sysctl_tcp_min_tso_segs */
-	u32 (*min_tso_segs)(struct sock *sk);
+	/* pick target number of segments per TSO/GSO skb (optional): */
+	u32 (*tso_segs)(struct sock *sk, unsigned int mss_now);
+
+	/* react to a specific lost skb (optional) */
+	void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb);
 
 	/* call when packets are delivered to update cwnd and pacing rate,
 	 * after all the ca_state processing. (optional)
@@ -1226,6 +1260,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
 }
 #endif
 
+static inline bool tcp_ca_wants_ce_events(const struct sock *sk)
+{
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN |
+					   TCP_CONG_WANTS_CE_EVENTS);
+}
+
 static inline bool tcp_ca_needs_ecn(const struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1245,6 +1287,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
 void tcp_set_ca_state(struct sock *sk, const u8 ca_state);
 
 /* From tcp_rate.c */
+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb);
 void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb);
 void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 			    struct rate_sample *rs);
@@ -1257,6 +1300,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
 }
 
+/* If a retransmit failed due to local qdisc congestion or other local issues,
+ * then we may have called tcp_set_skb_tso_segs() to increase the number of
+ * segments in the skb without increasing the tx.in_flight. In all other cases,
+ * the tx.in_flight should be at least as big as the pcount of the sk_buff.  We
+ * do not have the state to know whether a retransmit failed due to local qdisc
+ * congestion or other local issues, so to avoid spurious warnings we consider
+ * that any skb marked lost may have suffered that fate.
+ */
+static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount,
+						      u32 skb_sacked_flags,
+						      u32 tx_in_flight)
+{
+	return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST);
+}
+
 /* These functions determine how the current flow behaves in respect of SACK
  * handling. SACK is negotiated with the peer, and therefore it can vary
  * between different flows.
@@ -2419,7 +2477,7 @@ struct tcp_plb_state {
 	u8	consec_cong_rounds:5, /* consecutive congested rounds */
 		unused:3;
 	u32	pause_until; /* jiffies32 when PLB can resume rerouting */
-};
+} __attribute__ ((__packed__));
 
 static inline void tcp_plb_init(const struct sock *sk,
 				struct tcp_plb_state *plb)
diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h
index 50655de04c9b..82f8bd8f0d16 100644
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -229,6 +229,29 @@ struct tcp_bbr_info {
 	__u32	bbr_min_rtt;		/* min-filtered RTT in uSec */
 	__u32	bbr_pacing_gain;	/* pacing gain shifted left 8 bits */
 	__u32	bbr_cwnd_gain;		/* cwnd gain shifted left 8 bits */
+	__u32	bbr_bw_hi_lsb;		/* lower 32 bits of bw_hi */
+	__u32	bbr_bw_hi_msb;		/* upper 32 bits of bw_hi */
+	__u32	bbr_bw_lo_lsb;		/* lower 32 bits of bw_lo */
+	__u32	bbr_bw_lo_msb;		/* upper 32 bits of bw_lo */
+	__u8	bbr_mode;		/* current bbr_mode in state machine */
+	__u8	bbr_phase;		/* current state machine phase */
+	__u8	unused1;		/* alignment padding; not used yet */
+	__u8	bbr_version;		/* BBR algorithm version */
+	__u32	bbr_inflight_lo;	/* lower short-term data volume bound */
+	__u32	bbr_inflight_hi;	/* higher long-term data volume bound */
+	__u32	bbr_extra_acked;	/* max excess packets ACKed in epoch */
+};
+
+/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. */
+enum tcp_bbr_phase {
+	BBR_PHASE_INVALID		= 0,
+	BBR_PHASE_STARTUP		= 1,
+	BBR_PHASE_DRAIN			= 2,
+	BBR_PHASE_PROBE_RTT		= 3,
+	BBR_PHASE_PROBE_BW_UP		= 4,
+	BBR_PHASE_PROBE_BW_DOWN		= 5,
+	BBR_PHASE_PROBE_BW_CRUISE	= 6,
+	BBR_PHASE_PROBE_BW_REFILL	= 7,
 };
 
 union tcp_cc_info {
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 3b687d20c9ed..a7c30c243b54 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -507,12 +507,14 @@ enum {
 #define RTAX_FEATURE_TIMESTAMP		(1 << 2) /* unused */
 #define RTAX_FEATURE_ALLFRAG		(1 << 3) /* unused */
 #define RTAX_FEATURE_TCP_USEC_TS	(1 << 4)
+#define RTAX_FEATURE_ECN_LOW		(1 << 5)
 
 #define RTAX_FEATURE_MASK	(RTAX_FEATURE_ECN |		\
 				 RTAX_FEATURE_SACK |		\
 				 RTAX_FEATURE_TIMESTAMP |	\
 				 RTAX_FEATURE_ALLFRAG |		\
-				 RTAX_FEATURE_TCP_USEC_TS)
+				 RTAX_FEATURE_TCP_USEC_TS |	\
+				 RTAX_FEATURE_ECN_LOW)
 
 struct rta_session {
 	__u8	proto;
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index dbf896f3146c..4702cd2f1ffc 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail {
 #define TCPI_OPT_ECN_SEEN	16 /* we received at least one packet with ECT */
 #define TCPI_OPT_SYN_DATA	32 /* SYN-ACK acked data in SYN sent or rcvd */
 #define TCPI_OPT_USEC_TS	64 /* usec timestamps */
+#define TCPI_OPT_ECN_LOW	128 /* Low-latency ECN configured at init */
 
 /*
  * Sender's congestion state indicating normal or abnormal situations
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 8e94ed7c56a0..50dc9970cad2 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -668,15 +668,18 @@ config TCP_CONG_BBR
 	default n
 	help
 
-	  BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
-	  maximize network utilization and minimize queues. It builds an explicit
-	  model of the bottleneck delivery rate and path round-trip propagation
-	  delay. It tolerates packet loss and delay unrelated to congestion. It
-	  can operate over LAN, WAN, cellular, wifi, or cable modem links. It can
-	  coexist with flows that use loss-based congestion control, and can
-	  operate with shallow buffers, deep buffers, bufferbloat, policers, or
-	  AQM schemes that do not provide a delay signal. It requires the fq
-	  ("Fair Queue") pacing packet scheduler.
+	  BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a
+	  model-based congestion control algorithm that aims to maximize
+	  network utilization, keep queues and retransmit rates low, and to be
+	  able to coexist with Reno/CUBIC in common scenarios. It builds an
+	  explicit model of the network path.  It tolerates a targeted degree
+	  of random packet loss and delay. It can operate over LAN, WAN,
+	  cellular, wifi, or cable modem links, and can use shallow-threshold
+	  ECN signals. It can coexist to some degree with flows that use
+	  loss-based congestion control, and can operate with shallow buffers,
+	  deep buffers, bufferbloat, policers, or AQM schemes that do not
+	  provide a delay signal. It requires pacing, using either TCP internal
+	  pacing or the fq ("Fair Queue") pacing packet scheduler.
 
 choice
 	prompt "Default TCP congestion control"
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 18227757ec0c..f180befc28bd 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -305,11 +305,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp
 {
 }
 
-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk)
+static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now)
 {
 	return 0;
 }
 
+static void bpf_tcp_ca_skb_marked_lost(struct sock *sk, const struct sk_buff *skb)
+{
+}
+
 static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag,
 				    const struct rate_sample *rs)
 {
@@ -340,7 +344,8 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = {
 	.cwnd_event = bpf_tcp_ca_cwnd_event,
 	.in_ack_event = bpf_tcp_ca_in_ack_event,
 	.pkts_acked = bpf_tcp_ca_pkts_acked,
-	.min_tso_segs = bpf_tcp_ca_min_tso_segs,
+	.tso_segs = bpf_tcp_ca_tso_segs,
+	.skb_marked_lost = bpf_tcp_ca_skb_marked_lost,
 	.cong_control = bpf_tcp_ca_cong_control,
 	.undo_cwnd = bpf_tcp_ca_undo_cwnd,
 	.sndbuf_expand = bpf_tcp_ca_sndbuf_expand,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2edbd5b181e2..564fc423eb07 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3120,6 +3120,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->rx_opt.dsack = 0;
 	tp->rx_opt.num_sacks = 0;
 	tp->rcv_ooopack = 0;
+	tp->fast_ack_mode = 0;
 
 
 	/* Clean up fastopen related fields */
@@ -3846,6 +3847,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		info->tcpi_options |= TCPI_OPT_ECN;
 	if (tp->ecn_flags & TCP_ECN_SEEN)
 		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
+	if (tp->ecn_flags & TCP_ECN_LOW)
+		info->tcpi_options |= TCPI_OPT_ECN_LOW;
 	if (tp->syn_data_acked)
 		info->tcpi_options |= TCPI_OPT_SYN_DATA;
 	if (tp->tcp_usec_ts)
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 760941e55153..a180fa648d5e 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -1,18 +1,19 @@
-/* Bottleneck Bandwidth and RTT (BBR) congestion control
+/* BBR (Bottleneck Bandwidth and RTT) congestion control
  *
- * BBR congestion control computes the sending rate based on the delivery
- * rate (throughput) estimated from ACKs. In a nutshell:
+ * BBR is a model-based congestion control algorithm that aims for low queues,
+ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the
+ * network path, it uses measurements of bandwidth and RTT, as well as (if they
+ * occur) packet loss and/or shallow-threshold ECN signals. Note that although
+ * it can use ECN or loss signals explicitly, it does not require either; it
+ * can bound its in-flight data based on its estimate of the BDP.
  *
- *   On each ACK, update our model of the network path:
- *      bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
- *      min_rtt = windowed_min(rtt, 10 seconds)
- *   pacing_rate = pacing_gain * bottleneck_bandwidth
- *   cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
- *
- * The core algorithm does not react directly to packet losses or delays,
- * although BBR may adjust the size of next send per ACK when loss is
- * observed, or adjust the sending rate if it estimates there is a
- * traffic policer, in order to keep the drop rate reasonable.
+ * The model has both higher and lower bounds for the operating range:
+ *   lo: bw_lo, inflight_lo: conservative short-term lower bound
+ *   hi: bw_hi, inflight_hi: robust long-term upper bound
+ * The bandwidth-probing time scale is (a) extended dynamically based on
+ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by
+ * an interactive wall-clock time-scale to be more scalable and responsive
+ * than Reno and CUBIC.
  *
  * Here is a state transition diagram for BBR:
  *
@@ -65,6 +66,13 @@
 #include <linux/random.h>
 #include <linux/win_minmax.h>
 
+#include <trace/events/tcp.h>
+#include "tcp_dctcp.h"
+
+#define BBR_VERSION		3
+
+#define bbr_param(sk,name)	(bbr_ ## name)
+
 /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
  * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
  * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
@@ -85,36 +93,41 @@ enum bbr_mode {
 	BBR_PROBE_RTT,	/* cut inflight to min to probe min_rtt */
 };
 
+/* How does the incoming ACK stream relate to our bandwidth probing? */
+enum bbr_ack_phase {
+	BBR_ACKS_INIT,		  /* not probing; not getting probe feedback */
+	BBR_ACKS_REFILLING,	  /* sending at est. bw to fill pipe */
+	BBR_ACKS_PROBE_STARTING,  /* inflight rising to probe bw */
+	BBR_ACKS_PROBE_FEEDBACK,  /* getting feedback from bw probing */
+	BBR_ACKS_PROBE_STOPPING,  /* stopped probing; still getting feedback */
+};
+
 /* BBR congestion control block */
 struct bbr {
 	u32	min_rtt_us;	        /* min RTT in min_rtt_win_sec window */
 	u32	min_rtt_stamp;	        /* timestamp of min_rtt_us */
 	u32	probe_rtt_done_stamp;   /* end time for BBR_PROBE_RTT mode */
-	struct minmax bw;	/* Max recent delivery rate in pkts/uS << 24 */
-	u32	rtt_cnt;	    /* count of packet-timed rounds elapsed */
+	u32	probe_rtt_min_us;	/* min RTT in probe_rtt_win_ms win */
+	u32	probe_rtt_min_stamp;	/* timestamp of probe_rtt_min_us*/
 	u32     next_rtt_delivered; /* scb->tx.delivered at end of round */
 	u64	cycle_mstamp;	     /* time of this cycle phase start */
-	u32     mode:3,		     /* current bbr_mode in state machine */
+	u32     mode:2,		     /* current bbr_mode in state machine */
 		prev_ca_state:3,     /* CA state on previous ACK */
-		packet_conservation:1,  /* use packet conservation? */
 		round_start:1,	     /* start of packet-timed tx->ack round? */
+		ce_state:1,          /* If most recent data has CE bit set */
+		bw_probe_up_rounds:5,   /* cwnd-limited rounds in PROBE_UP */
+		try_fast_path:1,	/* can we take fast path? */
 		idle_restart:1,	     /* restarting after idle? */
 		probe_rtt_round_done:1,  /* a BBR_PROBE_RTT round at 4 pkts? */
-		unused:13,
-		lt_is_sampling:1,    /* taking long-term ("LT") samples now? */
-		lt_rtt_cnt:7,	     /* round trips in long-term interval */
-		lt_use_bw:1;	     /* use lt_bw as our bw estimate? */
-	u32	lt_bw;		     /* LT est delivery rate in pkts/uS << 24 */
-	u32	lt_last_delivered;   /* LT intvl start: tp->delivered */
-	u32	lt_last_stamp;	     /* LT intvl start: tp->delivered_mstamp */
-	u32	lt_last_lost;	     /* LT intvl start: tp->lost */
+		init_cwnd:7,         /* initial cwnd */
+		unused_1:10;
 	u32	pacing_gain:10,	/* current gain for setting pacing rate */
 		cwnd_gain:10,	/* current gain for setting cwnd */
 		full_bw_reached:1,   /* reached full bw in Startup? */
 		full_bw_cnt:2,	/* number of rounds without large bw gains */
-		cycle_idx:3,	/* current index in pacing_gain cycle array */
+		cycle_idx:2,	/* current index in pacing_gain cycle array */
 		has_seen_rtt:1, /* have we seen an RTT sample yet? */
-		unused_b:5;
+		unused_2:6;
 	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
 	u32	full_bw;	/* recent bw, to estimate if pipe is full */
 
@@ -124,19 +137,67 @@ struct bbr {
 	u32	ack_epoch_acked:20,	/* packets (S)ACKed in sampling epoch */
 		extra_acked_win_rtts:5,	/* age of extra_acked, in round trips */
 		extra_acked_win_idx:1,	/* current index in extra_acked array */
-		unused_c:6;
+	/* BBR v3 state: */
+		full_bw_now:1,		/* recently reached full bw plateau? */
+		startup_ecn_rounds:2,	/* consecutive hi ECN STARTUP rounds */
+		loss_in_cycle:1,	/* packet loss in this cycle? */
+		ecn_in_cycle:1,		/* ECN in this cycle? */
+		unused_3:1;
+	u32	loss_round_delivered; /* scb->tx.delivered ending loss round */
+	u32	undo_bw_lo;	     /* bw_lo before latest losses */
+	u32	undo_inflight_lo;    /* inflight_lo before latest losses */
+	u32	undo_inflight_hi;    /* inflight_hi before latest losses */
+	u32	bw_latest;	 /* max delivered bw in last round trip */
+	u32	bw_lo;		 /* lower bound on sending bandwidth */
+	u32	bw_hi[2];	 /* max recent measured bw sample */
+	u32	inflight_latest; /* max delivered data in last round trip */
+	u32	inflight_lo;	 /* lower bound of inflight data range */
+	u32	inflight_hi;	 /* upper bound of inflight data range */
+	u32	bw_probe_up_cnt; /* packets delivered per inflight_hi incr */
+	u32	bw_probe_up_acks;  /* packets (S)ACKed since inflight_hi incr */
+	u32	probe_wait_us;	 /* PROBE_DOWN until next clock-driven probe */
+	u32	prior_rcv_nxt;	/* tp->rcv_nxt when CE state last changed */
+	u32	ecn_eligible:1,	/* sender can use ECN (RTT, handshake)? */
+		ecn_alpha:9,	/* EWMA delivered_ce/delivered; 0..256 */
+		bw_probe_samples:1,    /* rate samples reflect bw probing? */
+		prev_probe_too_high:1, /* did last PROBE_UP go too high? */
+		stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */
+		rounds_since_probe:8,  /* packet-timed rounds since probed bw */
+		loss_round_start:1,    /* loss_round_delivered round trip? */
+		loss_in_round:1,       /* loss marked in this round trip? */
+		ecn_in_round:1,	       /* ECN marked in this round trip? */
+		ack_phase:3,	       /* bbr_ack_phase: meaning of ACKs */
+		loss_events_in_round:4,/* losses in STARTUP round */
+		initialized:1;	       /* has bbr_init() been called? */
+	u32	alpha_last_delivered;	 /* tp->delivered    at alpha update */
+	u32	alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */
+
+	u8	unused_4;		/* to preserve alignment */
+	struct tcp_plb_state plb;
 };
 
-#define CYCLE_LEN	8	/* number of phases in a pacing gain cycle */
+struct bbr_context {
+	u32 sample_bw;
+};
 
-/* Window length of bw filter (in rounds): */
-static const int bbr_bw_rtts = CYCLE_LEN + 2;
 /* Window length of min_rtt filter (in sec): */
 static const u32 bbr_min_rtt_win_sec = 10;
 /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
 static const u32 bbr_probe_rtt_mode_ms = 200;
-/* Skip TSO below the following bandwidth (bits/sec): */
-static const int bbr_min_tso_rate = 1200000;
+/* Window length of probe_rtt_min_us filter (in ms), and consequently the
+ * typical interval between PROBE_RTT mode entries. The default is 5000ms.
+ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC
+ */
+static const u32 bbr_probe_rtt_win_ms = 5000;
+/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */
+static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2;
+
+/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
+ * in bigger TSO bursts. We cut the RTT-based allowance in half
+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
+ * is below 1500 bytes after 6 * ~500 usec = 3ms.
+ */
+static const u32 bbr_tso_rtt_shift = 9;
 
 /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
  * In order to help drive the network toward lower queues and low latency while
@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000;
  */
 static const int bbr_pacing_margin_percent = 1;
 
-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
+/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value
  * that will allow a smoothly increasing pacing rate that will double each RTT
  * and send the same number of packets per RTT that an un-paced, slow-starting
  * Reno or CUBIC flow would:
  */
-static const int bbr_high_gain  = BBR_UNIT * 2885 / 1000 + 1;
-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
+static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1;
+/* The gain for deriving startup cwnd: */
+static const int bbr_startup_cwnd_gain = BBR_UNIT * 2;
+/* The pacing gain in BBR_DRAIN is calculated to typically drain
  * the queue created in BBR_STARTUP in a single round:
  */
 static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
 static const int bbr_cwnd_gain  = BBR_UNIT * 2;
 /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
 static const int bbr_pacing_gain[] = {
-	BBR_UNIT * 5 / 4,	/* probe for more available bw */
-	BBR_UNIT * 3 / 4,	/* drain queue and/or yield bw to other flows */
-	BBR_UNIT, BBR_UNIT, BBR_UNIT,	/* cruise at 1.0*bw to utilize pipe, */
-	BBR_UNIT, BBR_UNIT, BBR_UNIT	/* without creating excess queue... */
+	BBR_UNIT * 5 / 4,	/* UP: probe for more available bw */
+	BBR_UNIT * 91 / 100,	/* DOWN: drain queue and/or yield bw */
+	BBR_UNIT,		/* CRUISE: try to use pipe w/ some headroom */
+	BBR_UNIT,		/* REFILL: refill pipe to estimated 100% */
+};
+enum bbr_pacing_gain_phase {
+	BBR_BW_PROBE_UP		= 0,  /* push up inflight to probe for bw/vol */
+	BBR_BW_PROBE_DOWN	= 1,  /* drain excess inflight from the queue */
+	BBR_BW_PROBE_CRUISE	= 2,  /* use pipe, w/ headroom in queue/pipe */
+	BBR_BW_PROBE_REFILL	= 3,  /* v2: refill the pipe again to 100% */
 };
-/* Randomize the starting gain cycling phase over N phases: */
-static const u32 bbr_cycle_rand = 7;
 
 /* Try to keep at least this many packets in flight, if things go smoothly. For
  * smooth functioning, a sliding window protocol ACKing every other packet
@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7;
  */
 static const u32 bbr_cwnd_min_target = 4;
 
-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
+/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */
 /* If bw has increased significantly (1.25x), there may be more bw available: */
 static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
 /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
 static const u32 bbr_full_bw_cnt = 3;
 
-/* "long-term" ("LT") bandwidth estimator parameters... */
-/* The minimum number of rounds in an LT bw sampling interval: */
-static const u32 bbr_lt_intvl_min_rtts = 4;
-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
-static const u32 bbr_lt_loss_thresh = 50;
-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
-static const u32 bbr_lt_bw_diff = 4000 / 8;
-/* If we estimate we're policed, use lt_bw for this many round trips: */
-static const u32 bbr_lt_bw_max_rtts = 48;
-
 /* Gain factor for adding extra_acked to target cwnd: */
 static const int bbr_extra_acked_gain = BBR_UNIT;
 /* Window length of extra_acked window. */
@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
 /* Time period for clamping cwnd increment due to ack aggregation */
 static const u32 bbr_extra_acked_max_us = 100 * 1000;
 
+/* Flags to control BBR ECN-related behavior... */
+
+/* Ensure ACKs only ACK packets with consistent ECN CE status? */
+static const bool bbr_precise_ece_ack = true;
+
+/* Max RTT (in usec) at which to use sender-side ECN logic.
+ * Disabled when 0 (ECN allowed at any RTT).
+ */
+static const u32 bbr_ecn_max_rtt_us = 5000;
+
+/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE.
+ * No loss response when 0.
+ */
+static const u32 bbr_beta = BBR_UNIT * 30 / 100;
+
+/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */
+static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16;
+
+/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly
+ * to congestion if the bottleneck is congested when the flow starts up.
+ */
+static const u32 bbr_ecn_alpha_init = BBR_UNIT;
+
+/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE.
+ * No ECN based bounding when 0.
+ */
+static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3;	 /* 1/3 = 33% */
+
+/* Estimate bw probing has gone too far if CE ratio exceeds this threshold.
+ * Scaled by BBR_SCALE. Disabled when 0.
+ */
+static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2;  /* 1/2 = 50% */
+
+/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN
+ * clears then make the first round's increment to inflight_hi the following
+ * fraction of inflight_hi.
+ */
+static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2;
+
+/* Estimate bw probing has gone too far if loss rate exceeds this level. */
+static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100;  /* 2% loss */
+
+/* Slow down for a packet loss recovered by TLP? */
+static const bool bbr_loss_probe_recovery = true;
+
+/* Exit STARTUP if number of loss marking events in a Recovery round is >= N,
+ * and loss rate is higher than bbr_loss_thresh.
+ * Disabled if 0.
+ */
+static const u32 bbr_full_loss_cnt = 6;
+
+/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh
+ * meets this count.
+ */
+static const u32 bbr_full_ecn_cnt = 2;
+
+/* Fraction of unutilized headroom to try to leave in path upon high loss. */
+static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100;
+
+/* How much do we increase cwnd_gain when probing for bandwidth in
+ * BBR_BW_PROBE_UP? This specifies the increment in units of
+ * BBR_UNIT/4. The default is 1, meaning 0.25.
+ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75).
+ */
+static const u32 bbr_bw_probe_cwnd_gain = 1;
+
+/* Max number of packet-timed rounds to wait before probing for bandwidth.  If
+ * we want to tolerate 1% random loss per round, and not have this cut our
+ * inflight too much, we must probe for bw periodically on roughly this scale.
+ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance.
+ * We aim to be fair with Reno/CUBIC up to a BDP of at least:
+ *  BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets
+ */
+static const u32 bbr_bw_probe_max_rounds = 63;
+
+/* Max amount of randomness to inject in round counting for Reno-coexistence.
+ */
+static const u32 bbr_bw_probe_rand_rounds = 2;
+
+/* Use BBR-native probe time scale starting at this many usec.
+ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least:
+ *  BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs
+ */
+static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC;  /* 2 secs */
+
+/* Use BBR-native probes spread over this many usec: */
+static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC;  /* 1 secs */
+
+/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */
+static const bool bbr_fast_path = true;
+
+/* Use fast ack mode? */
+static const bool bbr_fast_ack_mode = true;
+
+static u32 bbr_max_bw(const struct sock *sk);
+static u32 bbr_bw(const struct sock *sk);
+static void bbr_exit_probe_rtt(struct sock *sk);
+static void bbr_reset_congestion_signals(struct sock *sk);
+static void bbr_run_loss_probe_recovery(struct sock *sk);
+
 static void bbr_check_probe_rtt_done(struct sock *sk);
 
+/* This connection can use ECN if both endpoints have signaled ECN support in
+ * the handshake and the per-route settings indicated this is a
+ * shallow-threshold ECN environment, meaning both:
+ *  (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and
+ *  (b) TCP endpoints provide precise ACKs that only ACK data segments
+ *      with consistent ECN CE status
+ */
+static bool bbr_can_use_ecn(const struct sock *sk)
+{
+	return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) &&
+	       (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW);
+}
+
 /* Do we estimate that STARTUP filled the pipe? */
 static bool bbr_full_bw_reached(const struct sock *sk)
 {
@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk)
 /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
 static u32 bbr_max_bw(const struct sock *sk)
 {
-	struct bbr *bbr = inet_csk_ca(sk);
+	const struct bbr *bbr = inet_csk_ca(sk);
 
-	return minmax_get(&bbr->bw);
+	return max(bbr->bw_hi[0], bbr->bw_hi[1]);
 }
 
 /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
 static u32 bbr_bw(const struct sock *sk)
 {
-	struct bbr *bbr = inet_csk_ca(sk);
+	const struct bbr *bbr = inet_csk_ca(sk);
 
-	return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
+	return min(bbr_max_bw(sk), bbr->bw_lo);
 }
 
 /* Return maximum extra acked in past k-2k round trips,
@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk)
  * The order here is chosen carefully to avoid overflow of u64. This should
  * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
  */
-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain,
+				  int margin)
 {
 	unsigned int mss = tcp_sk(sk)->mss_cache;
 
 	rate *= mss;
 	rate *= gain;
 	rate >>= BBR_SCALE;
-	rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
-	return rate >> BW_SCALE;
+	rate *= USEC_PER_SEC / 100 * (100 - margin);
+	rate >>= BW_SCALE;
+	rate = max(rate, 1ULL);
+	return rate;
+}
+
+static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate)
+{
+	return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0);
 }
 
 /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
 {
 	u64 rate = bw;
 
-	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
+	rate = bbr_rate_bytes_per_sec(sk, rate, gain,
+				      bbr_pacing_margin_percent);
 	rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate));
 	return rate;
 }
 
-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
+/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */
 static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -279,7 +456,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
 	bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
 	do_div(bw, rtt_us);
 	WRITE_ONCE(sk->sk_pacing_rate,
-		   bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain));
+		   bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain)));
 }
 
 /* Pace using current bw estimate and a gain factor. */
@@ -295,26 +472,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
 		WRITE_ONCE(sk->sk_pacing_rate, rate);
 }
 
-/* override sysctl_tcp_min_tso_segs */
-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
+/* Return the number of segments BBR would like in a TSO/GSO skb, given a
+ * particular max gso size as a constraint. TODO: make this simpler and more
+ * consistent by switching bbr to just call tcp_tso_autosize().
+ */
+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now,
+				u32 gso_max_size)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 segs, r;
+	u64 bytes;
+
+	/* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */
+	bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);
+
+	/* Budget a TSO/GSO burst size allowance based on min_rtt. For every
+	 * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst.
+	 * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K)
+	 */
+	if (bbr_param(sk, tso_rtt_shift)) {
+		r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift);
+		if (r < BITS_PER_TYPE(u32))   /* prevent undefined behavior */
+			bytes += GSO_LEGACY_MAX_SIZE >> r;
+	}
+
+	bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER);
+	segs = max_t(u32, bytes / mss_now,
+		     sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
+	return segs;
+}
+
+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */
+__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now)
 {
-	return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2;
+	return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size);
 }
 
+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */
 static u32 bbr_tso_segs_goal(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 segs, bytes;
-
-	/* Sort of tcp_tso_autosize() but ignoring
-	 * driver provided sk_gso_max_size.
-	 */
-	bytes = min_t(unsigned long,
-		      READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift),
-		      GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
-	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
 
-	return min(segs, 0x7FU);
+	return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE);
 }
 
 /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
@@ -334,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
 
-	if (event == CA_EVENT_TX_START && tp->app_limited) {
+	if (event == CA_EVENT_TX_START) {
+		if (!tp->app_limited)
+			return;
 		bbr->idle_restart = 1;
 		bbr->ack_epoch_mstamp = tp->tcp_mstamp;
 		bbr->ack_epoch_acked = 0;
@@ -345,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
 			bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
 		else if (bbr->mode == BBR_PROBE_RTT)
 			bbr_check_probe_rtt_done(sk);
+	} else if ((event == CA_EVENT_ECN_IS_CE ||
+		    event == CA_EVENT_ECN_NO_CE) &&
+		   bbr_can_use_ecn(sk) &&
+		   bbr_param(sk, precise_ece_ack)) {
+		u32 state = bbr->ce_state;
+		dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state);
+		bbr->ce_state = state;
+	} else if (event == CA_EVENT_TLP_RECOVERY &&
+		   bbr_param(sk, loss_probe_recovery)) {
+		bbr_run_loss_probe_recovery(sk);
 	}
 }
 
@@ -367,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
 	 * default. This should only happen when the connection is not using TCP
 	 * timestamps and has retransmitted all of the SYN/SYNACK/data packets
 	 * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
-	 * case we need to slow-start up toward something safe: TCP_INIT_CWND.
+	 * case we need to slow-start up toward something safe: initial cwnd.
 	 */
 	if (unlikely(bbr->min_rtt_us == ~0U))	 /* no valid RTT samples yet? */
-		return TCP_INIT_CWND;  /* be safe: cap at default initial cwnd*/
+		return bbr->init_cwnd;  /* be safe: cap at initial cwnd */
 
 	w = (u64)bw * bbr->min_rtt_us;
 
@@ -387,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
  *   - one skb in sending host Qdisc,
  *   - one skb in sending host TSO/GSO engine
  *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * Don't worry, at low rates this won't bloat cwnd because
+ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets,
  * which allows 2 outstanding 2-packet sequences, to try to keep pipe
  * full even with ACK-every-other-packet delayed ACKs.
  */
 static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
 {
 	struct bbr *bbr = inet_csk_ca(sk);
+	u32 tso_segs_goal;
 
-	/* Allow enough full-sized skbs in flight to utilize end systems. */
-	cwnd += 3 * bbr_tso_segs_goal(sk);
-
-	/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
-	cwnd = (cwnd + 1) & ~1U;
+	tso_segs_goal = 3 * bbr_tso_segs_goal(sk);
 
+	/* Allow enough full-sized skbs in flight to utilize end systems. */
+	cwnd = max_t(u32, cwnd, tso_segs_goal);
+	cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target));
 	/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
-	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
+	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP)
 		cwnd += 2;
 
 	return cwnd;
@@ -458,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
 {
 	u32 max_aggr_cwnd, aggr_cwnd = 0;
 
-	if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
+	if (bbr_param(sk, extra_acked_gain)) {
 		max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
 				/ BW_UNIT;
-		aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
+		aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk))
 			     >> BBR_SCALE;
 		aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
 	}
@@ -469,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
 	return aggr_cwnd;
 }
 
-/* An optimization in BBR to reduce losses: On the first round of recovery, we
- * follow the packet conservation principle: send P packets per P packets acked.
- * After that, we slow-start and send at most 2*P packets per P packets acked.
- * After recovery finishes, or upon undo, we restore the cwnd we had when
- * recovery started (capped by the target cwnd based on estimated BDP).
- *
- * TODO(ycheng/ncardwell): implement a rate-based approach.
- */
-static bool bbr_set_cwnd_to_recover_or_restore(
-	struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
+/* Returns the cwnd for PROBE_RTT mode. */
+static u32 bbr_probe_rtt_cwnd(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct bbr *bbr = inet_csk_ca(sk);
-	u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
-	u32 cwnd = tcp_snd_cwnd(tp);
-
-	/* An ACK for P pkts should release at most 2*P packets. We do this
-	 * in two steps. First, here we deduct the number of lost packets.
-	 * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
-	 */
-	if (rs->losses > 0)
-		cwnd = max_t(s32, cwnd - rs->losses, 1);
-
-	if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
-		/* Starting 1st round of Recovery, so do packet conservation. */
-		bbr->packet_conservation = 1;
-		bbr->next_rtt_delivered = tp->delivered;  /* start round now */
-		/* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
-		cwnd = tcp_packets_in_flight(tp) + acked;
-	} else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
-		/* Exiting loss recovery; restore cwnd saved before recovery. */
-		cwnd = max(cwnd, bbr->prior_cwnd);
-		bbr->packet_conservation = 0;
-	}
-	bbr->prev_ca_state = state;
-
-	if (bbr->packet_conservation) {
-		*new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
-		return true;	/* yes, using packet conservation */
-	}
-	*new_cwnd = cwnd;
-	return false;
+	return max_t(u32, bbr_param(sk, cwnd_min_target),
+		     bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain)));
 }
 
 /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
  * has drawn us down below target), or snap down to target if we're above it.
  */
 static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
-			 u32 acked, u32 bw, int gain)
+			 u32 acked, u32 bw, int gain, u32 cwnd,
+			 struct bbr_context *ctx)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
-	u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0;
+	u32 target_cwnd = 0;
 
 	if (!acked)
 		goto done;  /* no packet fully ACKed; just apply caps */
 
-	if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
-		goto done;
-
 	target_cwnd = bbr_bdp(sk, bw, gain);
 
 	/* Increment the cwnd to account for excess ACKed data that seems
@@ -537,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
 	target_cwnd += bbr_ack_aggregation_cwnd(sk);
 	target_cwnd = bbr_quantization_budget(sk, target_cwnd);
 
-	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
-	if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
-		cwnd = min(cwnd + acked, target_cwnd);
-	else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
-		cwnd = cwnd + acked;
-	cwnd = max(cwnd, bbr_cwnd_min_target);
+	/* Update cwnd and enable fast path if cwnd reaches target_cwnd. */
+	bbr->try_fast_path = 0;
+	if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */
+		cwnd += acked;
+		if (cwnd >= target_cwnd) {
+			cwnd = target_cwnd;
+			bbr->try_fast_path = 1;
+		}
+	} else if (cwnd < target_cwnd || cwnd  < 2 * bbr->init_cwnd) {
+		cwnd += acked;
+	} else {
+		bbr->try_fast_path = 1;
+	}
 
+	cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target));
 done:
-	tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));	/* apply global cap */
+	tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));  /* global cap */
 	if (bbr->mode == BBR_PROBE_RTT)  /* drain queue, refresh min_rtt */
-		tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target));
-}
-
-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
-static bool bbr_is_next_cycle_phase(struct sock *sk,
-				    const struct rate_sample *rs)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct bbr *bbr = inet_csk_ca(sk);
-	bool is_full_length =
-		tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
-		bbr->min_rtt_us;
-	u32 inflight, bw;
-
-	/* The pacing_gain of 1.0 paces at the estimated bw to try to fully
-	 * use the pipe without increasing the queue.
-	 */
-	if (bbr->pacing_gain == BBR_UNIT)
-		return is_full_length;		/* just use wall clock time */
-
-	inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
-	bw = bbr_max_bw(sk);
-
-	/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
-	 * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
-	 * small (e.g. on a LAN). We do not persist if packets are lost, since
-	 * a path with small buffers may not hold that much.
-	 */
-	if (bbr->pacing_gain > BBR_UNIT)
-		return is_full_length &&
-			(rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
-			 inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));
-
-	/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
-	 * probing didn't find more bw. If inflight falls to match BDP then we
-	 * estimate queue is drained; persisting would underutilize the pipe.
-	 */
-	return is_full_length ||
-		inflight <= bbr_inflight(sk, bw, BBR_UNIT);
-}
-
-static void bbr_advance_cycle_phase(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct bbr *bbr = inet_csk_ca(sk);
-
-	bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
-	bbr->cycle_mstamp = tp->delivered_mstamp;
-}
-
-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
-static void bbr_update_cycle_phase(struct sock *sk,
-				   const struct rate_sample *rs)
-{
-	struct bbr *bbr = inet_csk_ca(sk);
-
-	if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs))
-		bbr_advance_cycle_phase(sk);
+		tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp),
+					   bbr_probe_rtt_cwnd(sk)));
 }
 
 static void bbr_reset_startup_mode(struct sock *sk)
@@ -614,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk)
 	bbr->mode = BBR_STARTUP;
 }
 
-static void bbr_reset_probe_bw_mode(struct sock *sk)
-{
-	struct bbr *bbr = inet_csk_ca(sk);
-
-	bbr->mode = BBR_PROBE_BW;
-	bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand);
-	bbr_advance_cycle_phase(sk);	/* flip to next phase of gain cycle */
-}
-
-static void bbr_reset_mode(struct sock *sk)
-{
-	if (!bbr_full_bw_reached(sk))
-		bbr_reset_startup_mode(sk);
-	else
-		bbr_reset_probe_bw_mode(sk);
-}
-
-/* Start a new long-term sampling interval. */
-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct bbr *bbr = inet_csk_ca(sk);
-
-	bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC);
-	bbr->lt_last_delivered = tp->delivered;
-	bbr->lt_last_lost = tp->lost;
-	bbr->lt_rtt_cnt = 0;
-}
-
-/* Completely reset long-term bandwidth sampling. */
-static void bbr_reset_lt_bw_sampling(struct sock *sk)
-{
-	struct bbr *bbr = inet_csk_ca(sk);
-
-	bbr->lt_bw = 0;
-	bbr->lt_use_bw = 0;
-	bbr->lt_is_sampling = false;
-	bbr_reset_lt_bw_sampling_interval(sk);
-}
-
-/* Long-term bw sampling interval is done. Estimate whether we're policed. */
-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
-{
-	struct bbr *bbr = inet_csk_ca(sk);
-	u32 diff;
-
-	if (bbr->lt_bw) {  /* do we have bw from a previous interval? */
-		/* Is new bw close to the lt_bw from the previous interval? */
-		diff = abs(bw - bbr->lt_bw);
-		if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
-		    (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
-		     bbr_lt_bw_diff)) {
-			/* All criteria are met; estimate we're policed. */
-			bbr->lt_bw = (bw + bbr->lt_bw) >> 1;  /* avg 2 intvls */
-			bbr->lt_use_bw = 1;
-			bbr->pacing_gain = BBR_UNIT;  /* try to avoid drops */
-			bbr->lt_rtt_cnt = 0;
-			return;
-		}
-	}
-	bbr->lt_bw = bw;
-	bbr_reset_lt_bw_sampling_interval(sk);
-}
-
-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
- * explicitly models their policed rate, to reduce unnecessary losses. We
- * estimate that we're policed if we see 2 consecutive sampling intervals with
- * consistent throughput and high packet loss. If we think we're being policed,
- * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
+/* See if we have reached next round trip. Upon start of the new round,
+ * returns packets delivered since previous round start plus this ACK.
  */
-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct bbr *bbr = inet_csk_ca(sk);
-	u32 lost, delivered;
-	u64 bw;
-	u32 t;
-
-	if (bbr->lt_use_bw) {	/* already using long-term rate, lt_bw? */
-		if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
-		    ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
-			bbr_reset_lt_bw_sampling(sk);    /* stop using lt_bw */
-			bbr_reset_probe_bw_mode(sk);  /* restart gain cycling */
-		}
-		return;
-	}
-
-	/* Wait for the first loss before sampling, to let the policer exhaust
-	 * its tokens and estimate the steady-state rate allowed by the policer.
-	 * Starting samples earlier includes bursts that over-estimate the bw.
-	 */
-	if (!bbr->lt_is_sampling) {
-		if (!rs->losses)
-			return;
-		bbr_reset_lt_bw_sampling_interval(sk);
-		bbr->lt_is_sampling = true;
-	}
-
-	/* To avoid underestimates, reset sampling if we run out of data. */
-	if (rs->is_app_limited) {
-		bbr_reset_lt_bw_sampling(sk);
-		return;
-	}
-
-	if (bbr->round_start)
-		bbr->lt_rtt_cnt++;	/* count round trips in this interval */
-	if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
-		return;		/* sampling interval needs to be longer */
-	if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
-		bbr_reset_lt_bw_sampling(sk);  /* interval is too long */
-		return;
-	}
-
-	/* End sampling interval when a packet is lost, so we estimate the
-	 * policer tokens were exhausted. Stopping the sampling before the
-	 * tokens are exhausted under-estimates the policed rate.
-	 */
-	if (!rs->losses)
-		return;
-
-	/* Calculate packets lost and delivered in sampling interval. */
-	lost = tp->lost - bbr->lt_last_lost;
-	delivered = tp->delivered - bbr->lt_last_delivered;
-	/* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
-	if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
-		return;
-
-	/* Find average delivery rate in this sampling interval. */
-	t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp;
-	if ((s32)t < 1)
-		return;		/* interval is less than one ms, so wait */
-	/* Check if can multiply without overflow */
-	if (t >= ~0U / USEC_PER_MSEC) {
-		bbr_reset_lt_bw_sampling(sk);  /* interval too long; reset */
-		return;
-	}
-	t *= USEC_PER_MSEC;
-	bw = (u64)delivered * BW_UNIT;
-	do_div(bw, t);
-	bbr_lt_bw_interval_done(sk, bw);
-}
-
-/* Estimate the bandwidth based on how fast packets are delivered */
-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
+static u32 bbr_update_round_start(struct sock *sk,
+		const struct rate_sample *rs, struct bbr_context *ctx)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
-	u64 bw;
+	u32 round_delivered = 0;
 
 	bbr->round_start = 0;
-	if (rs->delivered < 0 || rs->interval_us <= 0)
-		return; /* Not a valid observation */
 
 	/* See if we've reached the next RTT */
-	if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
+	if (rs->interval_us > 0 &&
+	    !before(rs->prior_delivered, bbr->next_rtt_delivered)) {
+		round_delivered = tp->delivered - bbr->next_rtt_delivered;
 		bbr->next_rtt_delivered = tp->delivered;
-		bbr->rtt_cnt++;
 		bbr->round_start = 1;
-		bbr->packet_conservation = 0;
 	}
+	return round_delivered;
+}
 
-	bbr_lt_bw_sampling(sk, rs);
+/* Calculate the bandwidth based on how fast packets are delivered */
+static void bbr_calculate_bw_sample(struct sock *sk,
+			const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	u64 bw = 0;
 
 	/* Divide delivered by the interval to find a (lower bound) bottleneck
 	 * bandwidth sample. Delivered is in packets and interval_us in uS and
 	 * ratio will be <<1 for most connections. So delivered is first scaled.
+	 * Round up to allow growth at low rates, even with integer division.
 	 */
-	bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us);
-
-	/* If this sample is application-limited, it is likely to have a very
-	 * low delivered count that represents application behavior rather than
-	 * the available network rate. Such a sample could drag down estimated
-	 * bw, causing needless slow-down. Thus, to continue to send at the
-	 * last measured network rate, we filter out app-limited samples unless
-	 * they describe the path bw at least as well as our bw model.
-	 *
-	 * So the goal during app-limited phase is to proceed with the best
-	 * network rate no matter how long. We automatically leave this
-	 * phase when app writes faster than the network can deliver :)
-	 */
-	if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
-		/* Incorporate new sample into our max bw filter. */
-		minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
+	if (rs->interval_us > 0) {
+		if (WARN_ONCE(rs->delivered < 0,
+			      "negative delivered: %d interval_us: %ld\n",
+			      rs->delivered, rs->interval_us))
+			return;
+
+		bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us);
 	}
+
+	ctx->sample_bw = bw;
 }
 
 /* Estimates the windowed max degree of ack aggregation.
@@ -812,7 +794,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
  *
  * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
  * Max filter is an approximate sliding window of 5-10 (packet timed) round
- * trips.
+ * trips for non-startup phase, and 1-2 round trips for startup.
  */
 static void bbr_update_ack_aggregation(struct sock *sk,
 				       const struct rate_sample *rs)
@@ -820,15 +802,19 @@ static void bbr_update_ack_aggregation(struct sock *sk,
 	u32 epoch_us, expected_acked, extra_acked;
 	struct bbr *bbr = inet_csk_ca(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts);
 
-	if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 ||
+	if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 ||
 	    rs->delivered < 0 || rs->interval_us <= 0)
 		return;
 
 	if (bbr->round_start) {
 		bbr->extra_acked_win_rtts = min(0x1F,
 						bbr->extra_acked_win_rtts + 1);
-		if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) {
+		if (!bbr_full_bw_reached(sk))
+			extra_acked_win_rtts_thresh = 1;
+		if (bbr->extra_acked_win_rtts >=
+		    extra_acked_win_rtts_thresh) {
 			bbr->extra_acked_win_rtts = 0;
 			bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
 						   0 : 1;
@@ -862,49 +848,6 @@ static void bbr_update_ack_aggregation(struct sock *sk,
 		bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
 }
 
-/* Estimate when the pipe is full, using the change in delivery rate: BBR
- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
- * higher rwin, 3: we get higher delivery rate samples. Or transient
- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
- * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
- */
-static void bbr_check_full_bw_reached(struct sock *sk,
-				      const struct rate_sample *rs)
-{
-	struct bbr *bbr = inet_csk_ca(sk);
-	u32 bw_thresh;
-
-	if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
-		return;
-
-	bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
-	if (bbr_max_bw(sk) >= bw_thresh) {
-		bbr->full_bw = bbr_max_bw(sk);
-		bbr->full_bw_cnt = 0;
-		return;
-	}
-	++bbr->full_bw_cnt;
-	bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
-}
-
-/* If pipe is probably full, drain the queue and then enter steady-state. */
-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
-{
-	struct bbr *bbr = inet_csk_ca(sk);
-
-	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
-		bbr->mode = BBR_DRAIN;	/* drain queue we created */
-		tcp_sk(sk)->snd_ssthresh =
-				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
-	}	/* fall through to check if in-flight is already small: */
-	if (bbr->mode == BBR_DRAIN &&
-	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
-	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
-		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
-}
-
 static void bbr_check_probe_rtt_done(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -914,9 +857,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk)
 	      after(tcp_jiffies32, bbr->probe_rtt_done_stamp)))
 		return;
 
-	bbr->min_rtt_stamp = tcp_jiffies32;  /* wait a while until PROBE_RTT */
+	bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */
 	tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd));
-	bbr_reset_mode(sk);
+	bbr_exit_probe_rtt(sk);
 }
 
 /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
@@ -942,23 +885,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
-	bool filter_expired;
+	bool probe_rtt_expired, min_rtt_expired;
+	u32 expire;
 
-	/* Track min RTT seen in the min_rtt_win_sec filter window: */
-	filter_expired = after(tcp_jiffies32,
-			       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
+	/* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */
+	expire = bbr->probe_rtt_min_stamp +
+		 msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms));
+	probe_rtt_expired = after(tcp_jiffies32, expire);
 	if (rs->rtt_us >= 0 &&
-	    (rs->rtt_us < bbr->min_rtt_us ||
-	     (filter_expired && !rs->is_ack_delayed))) {
-		bbr->min_rtt_us = rs->rtt_us;
-		bbr->min_rtt_stamp = tcp_jiffies32;
+	    (rs->rtt_us < bbr->probe_rtt_min_us ||
+	     (probe_rtt_expired && !rs->is_ack_delayed))) {
+		bbr->probe_rtt_min_us = rs->rtt_us;
+		bbr->probe_rtt_min_stamp = tcp_jiffies32;
+	}
+	/* Track min RTT seen in the min_rtt_win_sec filter window: */
+	expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ;
+	min_rtt_expired = after(tcp_jiffies32, expire);
+	if (bbr->probe_rtt_min_us <= bbr->min_rtt_us ||
+	    min_rtt_expired) {
+		bbr->min_rtt_us = bbr->probe_rtt_min_us;
+		bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp;
 	}
 
-	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
+	if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired &&
 	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
 		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
 		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
 		bbr->probe_rtt_done_stamp = 0;
+		bbr->ack_phase = BBR_ACKS_PROBE_STOPPING;
+		bbr->next_rtt_delivered = tp->delivered;
 	}
 
 	if (bbr->mode == BBR_PROBE_RTT) {
@@ -967,9 +922,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
 			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
 		/* Maintain min packets in flight for max(200 ms, 1 round). */
 		if (!bbr->probe_rtt_done_stamp &&
-		    tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
+		    tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) {
 			bbr->probe_rtt_done_stamp = tcp_jiffies32 +
-				msecs_to_jiffies(bbr_probe_rtt_mode_ms);
+				msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms));
 			bbr->probe_rtt_round_done = 0;
 			bbr->next_rtt_delivered = tp->delivered;
 		} else if (bbr->probe_rtt_done_stamp) {
@@ -990,18 +945,20 @@ static void bbr_update_gains(struct sock *sk)
 
 	switch (bbr->mode) {
 	case BBR_STARTUP:
-		bbr->pacing_gain = bbr_high_gain;
-		bbr->cwnd_gain	 = bbr_high_gain;
+		bbr->pacing_gain = bbr_param(sk, startup_pacing_gain);
+		bbr->cwnd_gain	 = bbr_param(sk, startup_cwnd_gain);
 		break;
 	case BBR_DRAIN:
-		bbr->pacing_gain = bbr_drain_gain;	/* slow, to drain */
-		bbr->cwnd_gain	 = bbr_high_gain;	/* keep cwnd */
+		bbr->pacing_gain = bbr_param(sk, drain_gain);  /* slow, to drain */
+		bbr->cwnd_gain	 = bbr_param(sk, startup_cwnd_gain);  /* keep cwnd */
 		break;
 	case BBR_PROBE_BW:
-		bbr->pacing_gain = (bbr->lt_use_bw ?
-				    BBR_UNIT :
-				    bbr_pacing_gain[bbr->cycle_idx]);
-		bbr->cwnd_gain	 = bbr_cwnd_gain;
+		bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx];
+		bbr->cwnd_gain	 = bbr_param(sk, cwnd_gain);
+		if (bbr_param(sk, bw_probe_cwnd_gain) &&
+		    bbr->cycle_idx == BBR_BW_PROBE_UP)
+			bbr->cwnd_gain +=
+				BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4;
 		break;
 	case BBR_PROBE_RTT:
 		bbr->pacing_gain = BBR_UNIT;
@@ -1013,144 +970,1387 @@ static void bbr_update_gains(struct sock *sk)
 	}
 }
 
-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
+__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk)
 {
-	bbr_update_bw(sk, rs);
-	bbr_update_ack_aggregation(sk, rs);
-	bbr_update_cycle_phase(sk, rs);
-	bbr_check_full_bw_reached(sk, rs);
-	bbr_check_drain(sk, rs);
-	bbr_update_min_rtt(sk, rs);
-	bbr_update_gains(sk);
+	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
+	return 3;
 }
 
-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs)
+/* Incorporate a new bw sample into the current window of our max filter. */
+static void bbr_take_max_bw_sample(struct sock *sk, u32 bw)
 {
 	struct bbr *bbr = inet_csk_ca(sk);
-	u32 bw;
-
-	bbr_update_model(sk, rs);
 
-	bw = bbr_bw(sk);
-	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
-	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
+	bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]);
 }
 
-__bpf_kfunc static void bbr_init(struct sock *sk)
+/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */
+static void bbr_advance_max_bw_filter(struct sock *sk)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
 
-	bbr->prior_cwnd = 0;
-	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
-	bbr->rtt_cnt = 0;
-	bbr->next_rtt_delivered = tp->delivered;
-	bbr->prev_ca_state = TCP_CA_Open;
-	bbr->packet_conservation = 0;
-
-	bbr->probe_rtt_done_stamp = 0;
-	bbr->probe_rtt_round_done = 0;
-	bbr->min_rtt_us = tcp_min_rtt(tp);
-	bbr->min_rtt_stamp = tcp_jiffies32;
-
-	minmax_reset(&bbr->bw, bbr->rtt_cnt, 0);  /* init max bw to 0 */
+	if (!bbr->bw_hi[1])
+		return;  /* no samples in this window; remember old window */
+	bbr->bw_hi[0] = bbr->bw_hi[1];
+	bbr->bw_hi[1] = 0;
+}
 
-	bbr->has_seen_rtt = 0;
-	bbr_init_pacing_rate_from_rtt(sk);
+/* Reset the estimator for reaching full bandwidth based on bw plateau. */
+static void bbr_reset_full_bw(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
 
-	bbr->round_start = 0;
-	bbr->idle_restart = 0;
-	bbr->full_bw_reached = 0;
 	bbr->full_bw = 0;
 	bbr->full_bw_cnt = 0;
-	bbr->cycle_mstamp = 0;
-	bbr->cycle_idx = 0;
-	bbr_reset_lt_bw_sampling(sk);
-	bbr_reset_startup_mode(sk);
+	bbr->full_bw_now = 0;
+}
 
-	bbr->ack_epoch_mstamp = tp->tcp_mstamp;
-	bbr->ack_epoch_acked = 0;
-	bbr->extra_acked_win_rtts = 0;
-	bbr->extra_acked_win_idx = 0;
-	bbr->extra_acked[0] = 0;
-	bbr->extra_acked[1] = 0;
+/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */
+static u32 bbr_target_inflight(struct sock *sk)
+{
+	u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT);
 
-	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
+	return min(bdp, tcp_sk(sk)->snd_cwnd);
 }
 
-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk)
+static bool bbr_is_probing_bandwidth(struct sock *sk)
 {
-	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
-	return 3;
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return (bbr->mode == BBR_STARTUP) ||
+		(bbr->mode == BBR_PROBE_BW &&
+		 (bbr->cycle_idx == BBR_BW_PROBE_REFILL ||
+		  bbr->cycle_idx == BBR_BW_PROBE_UP));
+}
+
+/* Has the given amount of time elapsed since we marked the phase start? */
+static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	const struct bbr *bbr = inet_csk_ca(sk);
+
+	return tcp_stamp_us_delta(tp->tcp_mstamp,
+				  bbr->cycle_mstamp + interval_us) > 0;
+}
+
+static void bbr_handle_queue_too_high_in_startup(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 bdp;  /* estimated BDP in packets, with quantization budget */
+
+	bbr->full_bw_reached = 1;
+
+	bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
+	bbr->inflight_hi = max(bdp, bbr->inflight_latest);
+}
+
+/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */
+static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible ||
+	    !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh))
+		return;
+
+	if (ce_ratio >= bbr_param(sk, ecn_thresh))
+		bbr->startup_ecn_rounds++;
+	else
+		bbr->startup_ecn_rounds = 0;
+
+	if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) {
+		bbr_handle_queue_too_high_in_startup(sk);
+		return;
+	}
+}
+
+/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */
+static int bbr_update_ecn_alpha(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	s32 delivered, delivered_ce;
+	u64 alpha, ce_ratio;
+	u32 gain;
+	bool want_ecn_alpha;
+
+	/* See if we should use ECN sender logic for this connection. */
+	if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) &&
+	    bbr_param(sk, ecn_factor) &&
+	    (bbr->min_rtt_us <= bbr_ecn_max_rtt_us ||
+	     !bbr_ecn_max_rtt_us))
+		bbr->ecn_eligible = 1;
+
+	/* Skip updating alpha only if not ECN-eligible and PLB is disabled. */
+	want_ecn_alpha = (bbr->ecn_eligible ||
+			  (bbr_can_use_ecn(sk) &&
+			   READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)));
+	if (!want_ecn_alpha)
+		return -1;
+
+	delivered = tp->delivered - bbr->alpha_last_delivered;
+	delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce;
+
+	if (delivered == 0 ||		/* avoid divide by zero */
+	    WARN_ON_ONCE(delivered < 0 || delivered_ce < 0))  /* backwards? */
+		return -1;
+
+	BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE);
+	ce_ratio = (u64)delivered_ce << BBR_SCALE;
+	do_div(ce_ratio, delivered);
+
+	gain = bbr_param(sk, ecn_alpha_gain);
+	alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE;
+	alpha += (gain * ce_ratio) >> BBR_SCALE;
+	bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT);
+
+	bbr->alpha_last_delivered = tp->delivered;
+	bbr->alpha_last_delivered_ce = tp->delivered_ce;
+
+	bbr_check_ecn_too_high_in_startup(sk, ce_ratio);
+	return (int)ce_ratio;
 }
 
-/* In theory BBR does not need to undo the cwnd since it does not
- * always reduce cwnd on losses (see bbr_main()). Keep it for now.
+/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6
+ * flow label) if it encounters sustained congestion in the form of ECN marks.
  */
-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk)
+static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->round_start && ce_ratio >= 0)
+		tcp_plb_update_state(sk, &bbr->plb, ce_ratio);
+
+	tcp_plb_check_rehash(sk, &bbr->plb);
+}
+
+/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */
+static void bbr_raise_inflight_hi_slope(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 growth_this_round, cnt;
+
+	/* Calculate "slope": packets S/Acked per inflight_hi increment. */
+	growth_this_round = 1 << bbr->bw_probe_up_rounds;
+	bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30);
+	cnt = tcp_snd_cwnd(tp) / growth_this_round;
+	cnt = max(cnt, 1U);
+	bbr->bw_probe_up_cnt = cnt;
+}
+
+/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */
+static void bbr_probe_inflight_hi_upward(struct sock *sk,
+					  const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 delta;
+
+	if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi)
+		return;  /* not fully using inflight_hi, so don't grow it */
+
+	/* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */
+	bbr->bw_probe_up_acks += rs->acked_sacked;
+	if (bbr->bw_probe_up_acks >=  bbr->bw_probe_up_cnt) {
+		delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt;
+		bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt;
+		bbr->inflight_hi += delta;
+		bbr->try_fast_path = 0;  /* Need to update cwnd */
+	}
+
+	if (bbr->round_start)
+		bbr_raise_inflight_hi_slope(sk);
+}
+
+/* Does loss/ECN rate for this sample say inflight is "too high"?
+ * This is used by both the bbr_check_loss_too_high_in_startup() function,
+ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which
+ * uses it to notice when loss/ECN rates suggest inflight is too high.
+ */
+static bool bbr_is_inflight_too_high(const struct sock *sk,
+				      const struct rate_sample *rs)
+{
+	const struct bbr *bbr = inet_csk_ca(sk);
+	u32 loss_thresh, ecn_thresh;
+
+	if (rs->lost > 0 && rs->tx_in_flight) {
+		loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >>
+				BBR_SCALE;
+		if (rs->lost > loss_thresh) {
+			return true;
+		}
+	}
+
+	if (rs->delivered_ce > 0 && rs->delivered > 0 &&
+	    bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) {
+		ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >>
+				BBR_SCALE;
+		if (rs->delivered_ce > ecn_thresh) {
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/* Calculate the tx_in_flight level that corresponded to excessive loss.
+ * We find "lost_prefix" segs of the skb where loss rate went too high,
+ * by solving for "lost_prefix" in the following equation:
+ *   lost                     /  inflight                     >= loss_thresh
+ *  (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh
+ * Then we take that equation, convert it to fixed point, and
+ * round up to the nearest packet.
+ */
+static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk,
+					  const struct rate_sample *rs,
+					  const struct sk_buff *skb)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 loss_thresh  = bbr_param(sk, loss_thresh);
+	u32 pcount, divisor, inflight_hi;
+	s32 inflight_prev, lost_prev;
+	u64 loss_budget, lost_prefix;
+
+	pcount = tcp_skb_pcount(skb);
+
+	/* How much data was in flight before this skb? */
+	inflight_prev = rs->tx_in_flight - pcount;
+	if (inflight_prev < 0) {
+		WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious(
+				  pcount,
+				  TCP_SKB_CB(skb)->sacked,
+				  rs->tx_in_flight),
+			  "tx_in_flight: %u pcount: %u reneg: %u",
+			  rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg);
+		return ~0U;
+	}
+
+	/* How much inflight data was marked lost before this skb? */
+	lost_prev = rs->lost - pcount;
+	if (WARN_ONCE(lost_prev < 0,
+		      "cwnd: %u ca: %d out: %u lost: %u pif: %u "
+		      "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d "
+		      "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u",
+		      tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state,
+		      tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp),
+		      rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost,
+		      rs->lost, lost_prev, pcount,
+		      TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+		      tp->is_sack_reneg))
+		return ~0U;
+
+	/* At what prefix of this lost skb did losss rate exceed loss_thresh? */
+	loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1;
+	loss_budget >>= BBR_SCALE;
+	if (lost_prev >= loss_budget) {
+		lost_prefix = 0;   /* previous losses crossed loss_thresh */
+	} else {
+		lost_prefix = loss_budget - lost_prev;
+		lost_prefix <<= BBR_SCALE;
+		divisor = BBR_UNIT - loss_thresh;
+		if (WARN_ON_ONCE(!divisor))  /* loss_thresh is 8 bits */
+			return ~0U;
+		do_div(lost_prefix, divisor);
+	}
+
+	inflight_hi = inflight_prev + lost_prefix;
+	return inflight_hi;
+}
+
+/* If loss/ECN rates during probing indicated we may have overfilled a
+ * buffer, return an operating point that tries to leave unutilized headroom in
+ * the path for other flows, for fairness convergence and lower RTTs and loss.
+ */
+static u32 bbr_inflight_with_headroom(const struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 headroom, headroom_fraction;
+
+	if (bbr->inflight_hi == ~0U)
+		return ~0U;
+
+	headroom_fraction = bbr_param(sk, inflight_headroom);
+	headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE;
+	headroom = max(headroom, 1U);
+	return max_t(s32, bbr->inflight_hi - headroom,
+		     bbr_param(sk, cwnd_min_target));
+}
+
+/* Bound cwnd to a sensible level, based on our current probing state
+ * machine phase and model of a good inflight level (inflight_lo, inflight_hi).
+ */
+static void bbr_bound_cwnd_for_inflight_model(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 cap;
+
+	/* tcp_rcv_synsent_state_process() currently calls tcp_ack()
+	 * and thus cong_control() without first initializing us(!).
+	 */
+	if (!bbr->initialized)
+		return;
+
+	cap = ~0U;
+	if (bbr->mode == BBR_PROBE_BW &&
+	    bbr->cycle_idx != BBR_BW_PROBE_CRUISE) {
+		/* Probe to see if more packets fit in the path. */
+		cap = bbr->inflight_hi;
+	} else {
+		if (bbr->mode == BBR_PROBE_RTT ||
+		    (bbr->mode == BBR_PROBE_BW &&
+		     bbr->cycle_idx == BBR_BW_PROBE_CRUISE))
+			cap = bbr_inflight_with_headroom(sk);
+	}
+	/* Adapt to any loss/ECN since our last bw probe. */
+	cap = min(cap, bbr->inflight_lo);
+
+	cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target));
+	tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp)));
+}
+
+/* How should we multiplicatively cut bw or inflight limits based on ECN? */
+static u32 bbr_ecn_cut(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	return BBR_UNIT -
+		((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE);
+}
+
+/* Init lower bounds if have not inited yet. */
+static void bbr_init_lower_bounds(struct sock *sk, bool init_bw)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (init_bw && bbr->bw_lo == ~0U)
+		bbr->bw_lo = bbr_max_bw(sk);
+	if (bbr->inflight_lo == ~0U)
+		bbr->inflight_lo = tcp_snd_cwnd(tp);
+}
+
+/* Reduce bw and inflight to (1 - beta). */
+static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight)
+{
+	struct bbr* bbr = inet_csk_ca(sk);
+	u32 loss_cut = BBR_UNIT - bbr_param(sk, beta);
+
+	*bw = max_t(u32, bbr->bw_latest,
+		    (u64)bbr->bw_lo * loss_cut >> BBR_SCALE);
+	*inflight = max_t(u32, bbr->inflight_latest,
+			  (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE);
+}
+
+/* Reduce inflight to (1 - alpha*ecn_factor). */
+static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 ecn_cut = bbr_ecn_cut(sk);
+
+	*inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE;
+}
+
+/* Estimate a short-term lower bound on the capacity available now, based
+ * on measurements of the current delivery process and recent history. When we
+ * are seeing loss/ECN at times when we are not probing bw, then conservatively
+ * move toward flow balance by multiplicatively cutting our short-term
+ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a
+ * multiplicative decrease in order to converge to a lower capacity in time
+ * logarithmic in the magnitude of the decrease.
+ *
+ * However, we do not cut our short-term estimates lower than the current rate
+ * and volume of delivered data from this round trip, since from the current
+ * delivery process we can estimate the measured capacity available now.
+ *
+ * Anything faster than that approach would knowingly risk high loss, which can
+ * cause low bw for Reno/CUBIC and high loss recovery latency for
+ * request/response flows using any congestion control.
+ */
+static void bbr_adapt_lower_bounds(struct sock *sk,
+				    const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 ecn_inflight_lo = ~0U;
+
+	/* We only use lower-bound estimates when not probing bw.
+	 * When probing we need to push inflight higher to probe bw.
+	 */
+	if (bbr_is_probing_bandwidth(sk))
+		return;
+
+	/* ECN response. */
+	if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) {
+		bbr_init_lower_bounds(sk, false);
+		bbr_ecn_lower_bounds(sk, &ecn_inflight_lo);
+	}
+
+	/* Loss response. */
+	if (bbr->loss_in_round) {
+		bbr_init_lower_bounds(sk, true);
+		bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo);
+	}
+
+	/* Adjust to the lower of the levels implied by loss/ECN. */
+	bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo);
+	bbr->bw_lo = max(1U, bbr->bw_lo);
+}
+
+/* Reset any short-term lower-bound adaptation to congestion, so that we can
+ * push our inflight up.
+ */
+static void bbr_reset_lower_bounds(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->bw_lo = ~0U;
+	bbr->inflight_lo = ~0U;
+}
+
+/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state
+ * machine phase where we adapt our lower bound based on congestion signals.
+ */
+static void bbr_reset_congestion_signals(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->loss_in_round = 0;
+	bbr->ecn_in_round = 0;
+	bbr->loss_in_cycle = 0;
+	bbr->ecn_in_cycle = 0;
+	bbr->bw_latest = 0;
+	bbr->inflight_latest = 0;
+}
+
+static void bbr_exit_loss_recovery(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd));
+	bbr->try_fast_path = 0; /* bound cwnd using latest model */
+}
+
+/* Update rate and volume of delivered data from latest round trip. */
+static void bbr_update_latest_delivery_signals(
+	struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->loss_round_start = 0;
+	if (rs->interval_us <= 0 || !rs->acked_sacked)
+		return; /* Not a valid observation */
+
+	bbr->bw_latest       = max_t(u32, bbr->bw_latest,       ctx->sample_bw);
+	bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered);
+
+	if (!before(rs->prior_delivered, bbr->loss_round_delivered)) {
+		bbr->loss_round_delivered = tp->delivered;
+		bbr->loss_round_start = 1;  /* mark start of new round trip */
+	}
+}
+
+/* Once per round, reset filter for latest rate and volume of delivered data. */
+static void bbr_advance_latest_delivery_signals(
+	struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	/* If ACK matches a TLP retransmit, persist the filter. If we detect
+	 * that a TLP retransmit plugged a tail loss, we'll want to remember
+	 * how much data the path delivered before the tail loss.
+	 */
+	if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) {
+		bbr->bw_latest = ctx->sample_bw;
+		bbr->inflight_latest = rs->delivered;
+	}
+}
+
+/* Update (most of) our congestion signals: track the recent rate and volume of
+ * delivered data, presence of loss, and EWMA degree of ECN marking.
+ */
+static void bbr_update_congestion_signals(
+	struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx)
 {
 	struct bbr *bbr = inet_csk_ca(sk);
+	u64 bw;
+
+	if (rs->interval_us <= 0 || !rs->acked_sacked)
+		return; /* Not a valid observation */
+	bw = ctx->sample_bw;
 
-	bbr->full_bw = 0;   /* spurious slow-down; reset full pipe detection */
+	if (!rs->is_app_limited || bw >= bbr_max_bw(sk))
+		bbr_take_max_bw_sample(sk, bw);
+
+	bbr->loss_in_round |= (rs->losses > 0);
+
+	if (!bbr->loss_round_start)
+		return;		/* skip the per-round-trip updates */
+	/* Now do per-round-trip updates. */
+	bbr_adapt_lower_bounds(sk, rs);
+
+	bbr->loss_in_round = 0;
+	bbr->ecn_in_round  = 0;
+}
+
+/* Bandwidth probing can cause loss. To help coexistence with loss-based
+ * congestion control we spread out our probing in a Reno-conscious way. Due to
+ * the shape of the Reno sawtooth, the time required between loss epochs for an
+ * idealized Reno flow is a number of round trips that is the BDP of that
+ * flow. We count packet-timed round trips directly, since measured RTT can
+ * vary widely, and Reno is driven by packet-timed round trips.
+ */
+static bool bbr_is_reno_coexistence_probe_time(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 rounds;
+
+	/* Random loss can shave some small percentage off of our inflight
+	 * in each round. To survive this, flows need robust periodic probes.
+	 */
+	rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk));
+	return bbr->rounds_since_probe >= rounds;
+}
+
+/* How long do we want to wait before probing for bandwidth (and risking
+ * loss)? We randomize the wait, for better mixing and fairness convergence.
+ *
+ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips.
+ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow,
+ * (eg 4K video to a broadband user):
+ *   BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets
+ *
+ * We bound the BBR-native inter-bw-probe wall clock time to be:
+ *  (a) higher than 2 sec: to try to avoid causing loss for a long enough time
+ *      to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must
+ *      be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs
+ *  (b) lower than 3 sec: to ensure flows can start probing in a reasonable
+ *      amount of time to discover unutilized bw on human-scale interactive
+ *      time-scales (e.g. perhaps traffic from a web page download that we
+ *      were competing with is now complete).
+ */
+static void bbr_pick_probe_wait(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	/* Decide the random round-trip bound for wait until probe: */
+	bbr->rounds_since_probe =
+		get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds));
+	/* Decide the random wall clock bound for wait until probe: */
+	bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) +
+			     get_random_u32_below(bbr_param(sk, bw_probe_rand_us));
+}
+
+static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->cycle_idx = cycle_idx;
+	/* New phase, so need to update cwnd and pacing rate. */
+	bbr->try_fast_path = 0;
+}
+
+/* Send at estimated bw to fill the pipe, but not queue. We need this phase
+ * before PROBE_UP, because as soon as we send faster than the available bw
+ * we will start building a queue, and if the buffer is shallow we can cause
+ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and
+ * inflight_hi estimates will underestimate.
+ */
+static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr_reset_lower_bounds(sk);
+	bbr->bw_probe_up_rounds = bw_probe_up_rounds;
+	bbr->bw_probe_up_acks = 0;
+	bbr->stopped_risky_probe = 0;
+	bbr->ack_phase = BBR_ACKS_REFILLING;
+	bbr->next_rtt_delivered = tp->delivered;
+	bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL);
+}
+
+/* Now probe max deliverable data rate and volume. */
+static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->ack_phase = BBR_ACKS_PROBE_STARTING;
+	bbr->next_rtt_delivered = tp->delivered;
+	bbr->cycle_mstamp = tp->tcp_mstamp;
+	bbr_reset_full_bw(sk);
+	bbr->full_bw = ctx->sample_bw;
+	bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP);
+	bbr_raise_inflight_hi_slope(sk);
+}
+
+/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall
+ * clock time at which to probe beyond an inflight that we think to be
+ * safe. This will knowingly risk packet loss, so we want to do this rarely, to
+ * keep packet loss rates low. Also start a round-trip counter, to probe faster
+ * if we estimate a Reno flow at our BDP would probe faster.
+ */
+static void bbr_start_bw_probe_down(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr_reset_congestion_signals(sk);
+	bbr->bw_probe_up_cnt = ~0U;     /* not growing inflight_hi any more */
+	bbr_pick_probe_wait(sk);
+	bbr->cycle_mstamp = tp->tcp_mstamp;		/* start wall clock */
+	bbr->ack_phase = BBR_ACKS_PROBE_STOPPING;
+	bbr->next_rtt_delivered = tp->delivered;
+	bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN);
+}
+
+/* Cruise: maintain what we estimate to be a neutral, conservative
+ * operating point, without attempting to probe up for bandwidth or down for
+ * RTT, and only reducing inflight in response to loss/ECN signals.
+ */
+static void bbr_start_bw_probe_cruise(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->inflight_lo != ~0U)
+		bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi);
+
+	bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE);
+}
+
+/* Loss and/or ECN rate is too high while probing.
+ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle.
+ */
+static void bbr_handle_inflight_too_high(struct sock *sk,
+					  const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	const u32 beta = bbr_param(sk, beta);
+
+	bbr->prev_probe_too_high = 1;
+	bbr->bw_probe_samples = 0;  /* only react once per probe */
+	/* If we are app-limited then we are not robustly
+	 * probing the max volume of inflight data we think
+	 * might be safe (analogous to how app-limited bw
+	 * samples are not known to be robustly probing bw).
+	 */
+	if (!rs->is_app_limited) {
+		bbr->inflight_hi = max_t(u32, rs->tx_in_flight,
+					 (u64)bbr_target_inflight(sk) *
+					 (BBR_UNIT - beta) >> BBR_SCALE);
+	}
+	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP)
+		bbr_start_bw_probe_down(sk);
+}
+
+/* If we're seeing bw and loss samples reflecting our bw probing, adapt
+ * using the signals we see. If loss or ECN mark rate gets too high, then adapt
+ * inflight_hi downward. If we're able to push inflight higher without such
+ * signals, push higher: adapt inflight_hi upward.
+ */
+static bool bbr_adapt_upper_bounds(struct sock *sk,
+				    const struct rate_sample *rs,
+				    struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	/* Track when we'll see bw/loss samples resulting from our bw probes. */
+	if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start)
+		bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK;
+	if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) {
+		/* End of samples from bw probing phase. */
+		bbr->bw_probe_samples = 0;
+		bbr->ack_phase = BBR_ACKS_INIT;
+		/* At this point in the cycle, our current bw sample is also
+		 * our best recent chance at finding the highest available bw
+		 * for this flow. So now is the best time to forget the bw
+		 * samples from the previous cycle, by advancing the window.
+		 */
+		if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited)
+			bbr_advance_max_bw_filter(sk);
+		/* If we had an inflight_hi, then probed and pushed inflight all
+		 * the way up to hit that inflight_hi without seeing any
+		 * high loss/ECN in all the resulting ACKs from that probing,
+		 * then probe up again, this time letting inflight persist at
+		 * inflight_hi for a round trip, then accelerating beyond.
+		 */
+		if (bbr->mode == BBR_PROBE_BW &&
+		    bbr->stopped_risky_probe && !bbr->prev_probe_too_high) {
+			bbr_start_bw_probe_refill(sk, 0);
+			return true;  /* yes, decided state transition */
+		}
+	}
+	if (bbr_is_inflight_too_high(sk, rs)) {
+		if (bbr->bw_probe_samples)  /*  sample is from bw probing? */
+			bbr_handle_inflight_too_high(sk, rs);
+	} else {
+		/* Loss/ECN rate is declared safe. Adjust upper bound upward. */
+
+		if (bbr->inflight_hi == ~0U)
+			return false;   /* no excess queue signals yet */
+
+		/* To be resilient to random loss, we must raise bw/inflight_hi
+		 * if we observe in any phase that a higher level is safe.
+		 */
+		if (rs->tx_in_flight > bbr->inflight_hi) {
+			bbr->inflight_hi = rs->tx_in_flight;
+		}
+
+		if (bbr->mode == BBR_PROBE_BW &&
+		    bbr->cycle_idx == BBR_BW_PROBE_UP)
+			bbr_probe_inflight_hi_upward(sk, rs);
+	}
+
+	return false;
+}
+
+/* Check if it's time to probe for bandwidth now, and if so, kick it off. */
+static bool bbr_check_time_to_probe_bw(struct sock *sk,
+					const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 n;
+
+	/* If we seem to be at an operating point where we are not seeing loss
+	 * but we are seeing ECN marks, then when the ECN marks cease we reprobe
+	 * quickly (in case cross-traffic has ceased and freed up bw).
+	 */
+	if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible &&
+	    bbr->ecn_in_cycle && !bbr->loss_in_cycle &&
+	    inet_csk(sk)->icsk_ca_state == TCP_CA_Open) {
+		/* Calculate n so that when bbr_raise_inflight_hi_slope()
+		 * computes growth_this_round as 2^n it will be roughly the
+		 * desired volume of data (inflight_hi*ecn_reprobe_gain).
+		 */
+		n = ilog2((((u64)bbr->inflight_hi *
+			    bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE));
+		bbr_start_bw_probe_refill(sk, n);
+		return true;
+	}
+
+	if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) ||
+	    bbr_is_reno_coexistence_probe_time(sk)) {
+		bbr_start_bw_probe_refill(sk, 0);
+		return true;
+	}
+	return false;
+}
+
+/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */
+static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw)
+{
+	/* Always need to pull inflight down to leave headroom in queue. */
+	if (inflight > bbr_inflight_with_headroom(sk))
+		return false;
+
+	return inflight <= bbr_inflight(sk, bw, BBR_UNIT);
+}
+
+/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */
+static void bbr_update_cycle_phase(struct sock *sk,
+				    const struct rate_sample *rs,
+				    struct bbr_context *ctx)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	bool is_bw_probe_done = false;
+	u32 inflight, bw;
+
+	if (!bbr_full_bw_reached(sk))
+		return;
+
+	/* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */
+	if (bbr_adapt_upper_bounds(sk, rs, ctx))
+		return;		/* already decided state transition */
+
+	if (bbr->mode != BBR_PROBE_BW)
+		return;
+
+	inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
+	bw = bbr_max_bw(sk);
+
+	switch (bbr->cycle_idx) {
+	/* First we spend most of our time cruising with a pacing_gain of 1.0,
+	 * which paces at the estimated bw, to try to fully use the pipe
+	 * without building queue. If we encounter loss/ECN marks, we adapt
+	 * by slowing down.
+	 */
+	case BBR_BW_PROBE_CRUISE:
+		if (bbr_check_time_to_probe_bw(sk, rs))
+			return;		/* already decided state transition */
+		break;
+
+	/* After cruising, when it's time to probe, we first "refill": we send
+	 * at the estimated bw to fill the pipe, before probing higher and
+	 * knowingly risking overflowing the bottleneck buffer (causing loss).
+	 */
+	case BBR_BW_PROBE_REFILL:
+		if (bbr->round_start) {
+			/* After one full round trip of sending in REFILL, we
+			 * start to see bw samples reflecting our REFILL, which
+			 * may be putting too much data in flight.
+			 */
+			bbr->bw_probe_samples = 1;
+			bbr_start_bw_probe_up(sk, ctx);
+		}
+		break;
+
+	/* After we refill the pipe, we probe by using a pacing_gain > 1.0, to
+	 * probe for bw. If we have not seen loss/ECN, we try to raise inflight
+	 * to at least pacing_gain*BDP; note that this may take more than
+	 * min_rtt if min_rtt is small (e.g. on a LAN).
+	 *
+	 * We terminate PROBE_UP bandwidth probing upon any of the following:
+	 *
+	 * (1) We've pushed inflight up to hit the inflight_hi target set in the
+	 *     most recent previous bw probe phase. Thus we want to start
+	 *     draining the queue immediately because it's very likely the most
+	 *     recently sent packets will fill the queue and cause drops.
+	 * (2) If inflight_hi has not limited bandwidth growth recently, and
+	 *     yet delivered bandwidth has not increased much recently
+	 *     (bbr->full_bw_now).
+	 * (3) Loss filter says loss rate is "too high".
+	 * (4) ECN filter says ECN mark rate is "too high".
+	 *
+	 * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high()
+	 */
+	case BBR_BW_PROBE_UP:
+		if (bbr->prev_probe_too_high &&
+		    inflight >= bbr->inflight_hi) {
+			bbr->stopped_risky_probe = 1;
+			is_bw_probe_done = true;
+		} else {
+			if (tp->is_cwnd_limited &&
+			    tcp_snd_cwnd(tp) >= bbr->inflight_hi) {
+				/* inflight_hi is limiting bw growth */
+				bbr_reset_full_bw(sk);
+				bbr->full_bw = ctx->sample_bw;
+			} else if (bbr->full_bw_now) {
+				/* Plateau in estimated bw. Pipe looks full. */
+				is_bw_probe_done = true;
+			}
+		}
+		if (is_bw_probe_done) {
+			bbr->prev_probe_too_high = 0;  /* no loss/ECN (yet) */
+			bbr_start_bw_probe_down(sk);  /* restart w/ down */
+		}
+		break;
+
+	/* After probing in PROBE_UP, we have usually accumulated some data in
+	 * the bottleneck buffer (if bw probing didn't find more bw). We next
+	 * enter PROBE_DOWN to try to drain any excess data from the queue. To
+	 * do this, we use a pacing_gain < 1.0. We hold this pacing gain until
+	 * our inflight is less then that target cruising point, which is the
+	 * minimum of (a) the amount needed to leave headroom, and (b) the
+	 * estimated BDP. Once inflight falls to match the target, we estimate
+	 * the queue is drained; persisting would underutilize the pipe.
+	 */
+	case BBR_BW_PROBE_DOWN:
+		if (bbr_check_time_to_probe_bw(sk, rs))
+			return;		/* already decided state transition */
+		if (bbr_check_time_to_cruise(sk, inflight, bw))
+			bbr_start_bw_probe_cruise(sk);
+		break;
+
+	default:
+		WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx);
+	}
+}
+
+/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */
+static void bbr_exit_probe_rtt(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr_reset_lower_bounds(sk);
+	if (bbr_full_bw_reached(sk)) {
+		bbr->mode = BBR_PROBE_BW;
+		/* Raising inflight after PROBE_RTT may cause loss, so reset
+		 * the PROBE_BW clock and schedule the next bandwidth probe for
+		 * a friendly and randomized future point in time.
+		 */
+		bbr_start_bw_probe_down(sk);
+		/* Since we are exiting PROBE_RTT, we know inflight is
+		 * below our estimated BDP, so it is reasonable to cruise.
+		 */
+		bbr_start_bw_probe_cruise(sk);
+	} else {
+		bbr->mode = BBR_STARTUP;
+	}
+}
+
+/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until
+ * the end of the round in recovery to get a good estimate of how many packets
+ * have been lost, and how many we need to drain with a low pacing rate.
+ */
+static void bbr_check_loss_too_high_in_startup(struct sock *sk,
+						const struct rate_sample *rs)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr_full_bw_reached(sk))
+		return;
+
+	/* For STARTUP exit, check the loss rate at the end of each round trip
+	 * of Recovery episodes in STARTUP. We check the loss rate at the end
+	 * of the round trip to filter out noisy/low loss and have a better
+	 * sense of inflight (extent of loss), so we can drain more accurately.
+	 */
+	if (rs->losses && bbr->loss_events_in_round < 0xf)
+		bbr->loss_events_in_round++;  /* update saturating counter */
+	if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start &&
+	    inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery &&
+	    bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) &&
+	    bbr_is_inflight_too_high(sk, rs)) {
+		bbr_handle_queue_too_high_in_startup(sk);
+		return;
+	}
+	if (bbr->loss_round_start)
+		bbr->loss_events_in_round = 0;
+}
+
+/* Estimate when the pipe is full, using the change in delivery rate: BBR
+ * estimates bw probing filled the pipe if the estimated bw hasn't changed by
+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
+ * higher rwin, 3: we get higher delivery rate samples. Or transient
+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
+ */
+static void bbr_check_full_bw_reached(struct sock *sk,
+				       const struct rate_sample *rs,
+				       struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 bw_thresh, full_cnt, thresh;
+
+	if (bbr->full_bw_now || rs->is_app_limited)
+		return;
+
+	thresh = bbr_param(sk, full_bw_thresh);
+	full_cnt = bbr_param(sk, full_bw_cnt);
+	bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE;
+	if (ctx->sample_bw >= bw_thresh) {
+		bbr_reset_full_bw(sk);
+		bbr->full_bw = ctx->sample_bw;
+		return;
+	}
+	if (!bbr->round_start)
+		return;
+	++bbr->full_bw_cnt;
+	bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt;
+	bbr->full_bw_reached |= bbr->full_bw_now;
+}
+
+/* If pipe is probably full, drain the queue and then enter steady-state. */
+static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs,
+			    struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
+		bbr->mode = BBR_DRAIN;	/* drain queue we created */
+		/* Set ssthresh to export purely for monitoring, to signal
+		 * completion of initial STARTUP by setting to a non-
+		 * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR).
+		 */
+		tcp_sk(sk)->snd_ssthresh =
+				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
+		bbr_reset_congestion_signals(sk);
+	}	/* fall through to check if in-flight is already small: */
+	if (bbr->mode == BBR_DRAIN &&
+	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
+	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) {
+		bbr->mode = BBR_PROBE_BW;
+		bbr_start_bw_probe_down(sk);
+	}
+}
+
+static void bbr_update_model(struct sock *sk, const struct rate_sample *rs,
+			      struct bbr_context *ctx)
+{
+	bbr_update_congestion_signals(sk, rs, ctx);
+	bbr_update_ack_aggregation(sk, rs);
+	bbr_check_loss_too_high_in_startup(sk, rs);
+	bbr_check_full_bw_reached(sk, rs, ctx);
+	bbr_check_drain(sk, rs, ctx);
+	bbr_update_cycle_phase(sk, rs, ctx);
+	bbr_update_min_rtt(sk, rs);
+}
+
+/* Fast path for app-limited case.
+ *
+ * On each ack, we execute bbr state machine, which primarily consists of:
+ * 1) update model based on new rate sample, and
+ * 2) update control based on updated model or state change.
+ *
+ * There are certain workload/scenarios, e.g. app-limited case, where
+ * either we can skip updating model or we can skip update of both model
+ * as well as control. This provides signifcant softirq cpu savings for
+ * processing incoming acks.
+ *
+ * In case of app-limited, if there is no congestion (loss/ecn) and
+ * if observed bw sample is less than current estimated bw, then we can
+ * skip some of the computation in bbr state processing:
+ *
+ * - if there is no rtt/mode/phase change: In this case, since all the
+ *   parameters of the network model are constant, we can skip model
+ *   as well control update.
+ *
+ * - else we can skip rest of the model update. But we still need to
+ *   update the control to account for the new rtt/mode/phase.
+ *
+ * Returns whether we can take fast path or not.
+ */
+static bool bbr_run_fast_path(struct sock *sk, bool *update_model,
+		const struct rate_sample *rs, struct bbr_context *ctx)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+	u32 prev_min_rtt_us, prev_mode;
+
+	if (bbr_param(sk, fast_path) && bbr->try_fast_path &&
+	    rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) &&
+	    !bbr->loss_in_round && !bbr->ecn_in_round ) {
+		prev_mode = bbr->mode;
+		prev_min_rtt_us = bbr->min_rtt_us;
+		bbr_check_drain(sk, rs, ctx);
+		bbr_update_cycle_phase(sk, rs, ctx);
+		bbr_update_min_rtt(sk, rs);
+
+		if (bbr->mode == prev_mode &&
+		    bbr->min_rtt_us == prev_min_rtt_us &&
+		    bbr->try_fast_path) {
+			return true;
+		}
+
+		/* Skip model update, but control still needs to be updated */
+		*update_model = false;
+	}
+	return false;
+}
+
+__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	struct bbr_context ctx = { 0 };
+	bool update_model = true;
+	u32 bw, round_delivered;
+	int ce_ratio = -1;
+
+	round_delivered = bbr_update_round_start(sk, rs, &ctx);
+	if (bbr->round_start) {
+		bbr->rounds_since_probe =
+			min_t(s32, bbr->rounds_since_probe + 1, 0xFF);
+		ce_ratio = bbr_update_ecn_alpha(sk);
+	}
+	bbr_plb(sk, rs, ce_ratio);
+
+	bbr->ecn_in_round  |= (bbr->ecn_eligible && rs->is_ece);
+	bbr_calculate_bw_sample(sk, rs, &ctx);
+	bbr_update_latest_delivery_signals(sk, rs, &ctx);
+
+	if (bbr_run_fast_path(sk, &update_model, rs, &ctx))
+		goto out;
+
+	if (update_model)
+		bbr_update_model(sk, rs, &ctx);
+
+	bbr_update_gains(sk);
+	bw = bbr_bw(sk);
+	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
+	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain,
+		     tcp_snd_cwnd(tp), &ctx);
+	bbr_bound_cwnd_for_inflight_model(sk);
+
+out:
+	bbr_advance_latest_delivery_signals(sk, rs, &ctx);
+	bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state;
+	bbr->loss_in_cycle |= rs->lost > 0;
+	bbr->ecn_in_cycle  |= rs->delivered_ce > 0;
+}
+
+__bpf_kfunc static void bbr_init(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr->initialized = 1;
+
+	bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp));
+	bbr->prior_cwnd = tp->prior_cwnd;
+	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+	bbr->next_rtt_delivered = tp->delivered;
+	bbr->prev_ca_state = TCP_CA_Open;
+
+	bbr->probe_rtt_done_stamp = 0;
+	bbr->probe_rtt_round_done = 0;
+	bbr->probe_rtt_min_us = tcp_min_rtt(tp);
+	bbr->probe_rtt_min_stamp = tcp_jiffies32;
+	bbr->min_rtt_us = tcp_min_rtt(tp);
+	bbr->min_rtt_stamp = tcp_jiffies32;
+
+	bbr->has_seen_rtt = 0;
+	bbr_init_pacing_rate_from_rtt(sk);
+
+	bbr->round_start = 0;
+	bbr->idle_restart = 0;
+	bbr->full_bw_reached = 0;
+	bbr->full_bw = 0;
 	bbr->full_bw_cnt = 0;
-	bbr_reset_lt_bw_sampling(sk);
-	return tcp_snd_cwnd(tcp_sk(sk));
+	bbr->cycle_mstamp = 0;
+	bbr->cycle_idx = 0;
+
+	bbr_reset_startup_mode(sk);
+
+	bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+	bbr->ack_epoch_acked = 0;
+	bbr->extra_acked_win_rtts = 0;
+	bbr->extra_acked_win_idx = 0;
+	bbr->extra_acked[0] = 0;
+	bbr->extra_acked[1] = 0;
+
+	bbr->ce_state = 0;
+	bbr->prior_rcv_nxt = tp->rcv_nxt;
+	bbr->try_fast_path = 0;
+
+	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
+
+	/* Start sampling ECN mark rate after first full flight is ACKed: */
+	bbr->loss_round_delivered = tp->delivered + 1;
+	bbr->loss_round_start = 0;
+	bbr->undo_bw_lo = 0;
+	bbr->undo_inflight_lo = 0;
+	bbr->undo_inflight_hi = 0;
+	bbr->loss_events_in_round = 0;
+	bbr->startup_ecn_rounds = 0;
+	bbr_reset_congestion_signals(sk);
+	bbr->bw_lo = ~0U;
+	bbr->bw_hi[0] = 0;
+	bbr->bw_hi[1] = 0;
+	bbr->inflight_lo = ~0U;
+	bbr->inflight_hi = ~0U;
+	bbr_reset_full_bw(sk);
+	bbr->bw_probe_up_cnt = ~0U;
+	bbr->bw_probe_up_acks = 0;
+	bbr->bw_probe_up_rounds = 0;
+	bbr->probe_wait_us = 0;
+	bbr->stopped_risky_probe = 0;
+	bbr->ack_phase = BBR_ACKS_INIT;
+	bbr->rounds_since_probe = 0;
+	bbr->bw_probe_samples = 0;
+	bbr->prev_probe_too_high = 0;
+	bbr->ecn_eligible = 0;
+	bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init);
+	bbr->alpha_last_delivered = 0;
+	bbr->alpha_last_delivered_ce = 0;
+	bbr->plb.pause_until = 0;
+
+	tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0;
+
+	if (bbr_can_use_ecn(sk))
+		tp->ecn_flags |= TCP_ECN_ECT_PERMANENT;
+}
+
+/* BBR marks the current round trip as a loss round. */
+static void bbr_note_loss(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	/* Capture "current" data over the full round trip of loss, to
+	 * have a better chance of observing the full capacity of the path.
+	 */
+	if (!bbr->loss_in_round)  /* first loss in this round trip? */
+		bbr->loss_round_delivered = tp->delivered;  /* set round trip */
+	bbr->loss_in_round = 1;
+	bbr->loss_in_cycle = 1;
 }
 
-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
+/* Core TCP stack informs us that the given skb was just marked lost. */
+__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk,
+					    const struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+	struct rate_sample rs = {};
+
+	bbr_note_loss(sk);
+
+	if (!bbr->bw_probe_samples)
+		return;  /* not an skb sent while probing for bandwidth */
+	if (unlikely(!scb->tx.delivered_mstamp))
+		return;  /* skb was SACKed, reneged, marked lost; ignore it */
+	/* We are probing for bandwidth. Construct a rate sample that
+	 * estimates what happened in the flight leading up to this lost skb,
+	 * then see if the loss rate went too high, and if so at which packet.
+	 */
+	rs.tx_in_flight = scb->tx.in_flight;
+	rs.lost = tp->lost - scb->tx.lost;
+	rs.is_app_limited = scb->tx.is_app_limited;
+	if (bbr_is_inflight_too_high(sk, &rs)) {
+		rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb);
+		bbr_handle_inflight_too_high(sk, &rs);
+	}
+}
+
+static void bbr_run_loss_probe_recovery(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct bbr *bbr = inet_csk_ca(sk);
+	struct rate_sample rs = {0};
+
+	bbr_note_loss(sk);
+
+	if (!bbr->bw_probe_samples)
+		return;  /* not sent while probing for bandwidth */
+	/* We are probing for bandwidth. Construct a rate sample that
+	 * estimates what happened in the flight leading up to this
+	 * loss, then see if the loss rate went too high.
+	 */
+	rs.lost = 1;	/* TLP probe repaired loss of a single segment */
+	rs.tx_in_flight = bbr->inflight_latest + rs.lost;
+	rs.is_app_limited = tp->tlp_orig_data_app_limited;
+	if (bbr_is_inflight_too_high(sk, &rs))
+		bbr_handle_inflight_too_high(sk, &rs);
+}
+
+/* Revert short-term model if current loss recovery event was spurious. */
+__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk)
+{
+	struct bbr *bbr = inet_csk_ca(sk);
+
+	bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */
+	bbr->loss_in_round = 0;
+
+	/* Revert to cwnd and other state saved before loss episode. */
+	bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo);
+	bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo);
+	bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi);
+	bbr->try_fast_path = 0;  /* take slow path to set proper cwnd, pacing */
+	return bbr->prior_cwnd;
+}
+
+/* Entering loss recovery, so save state for when we undo recovery. */
 __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk)
 {
+	struct bbr *bbr = inet_csk_ca(sk);
+
 	bbr_save_cwnd(sk);
+	/* For undo, save state that adapts based on loss signal. */
+	bbr->undo_bw_lo		= bbr->bw_lo;
+	bbr->undo_inflight_lo	= bbr->inflight_lo;
+	bbr->undo_inflight_hi	= bbr->inflight_hi;
 	return tcp_sk(sk)->snd_ssthresh;
 }
 
+static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr)
+{
+	switch (bbr->mode) {
+	case BBR_STARTUP:
+		return BBR_PHASE_STARTUP;
+	case BBR_DRAIN:
+		return BBR_PHASE_DRAIN;
+	case BBR_PROBE_BW:
+		break;
+	case BBR_PROBE_RTT:
+		return BBR_PHASE_PROBE_RTT;
+	default:
+		return BBR_PHASE_INVALID;
+	}
+	switch (bbr->cycle_idx) {
+	case BBR_BW_PROBE_UP:
+		return BBR_PHASE_PROBE_BW_UP;
+	case BBR_BW_PROBE_DOWN:
+		return BBR_PHASE_PROBE_BW_DOWN;
+	case BBR_BW_PROBE_CRUISE:
+		return BBR_PHASE_PROBE_BW_CRUISE;
+	case BBR_BW_PROBE_REFILL:
+		return BBR_PHASE_PROBE_BW_REFILL;
+	default:
+		return BBR_PHASE_INVALID;
+	}
+}
+
 static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
-			   union tcp_cc_info *info)
+			    union tcp_cc_info *info)
 {
 	if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
 	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
-		struct tcp_sock *tp = tcp_sk(sk);
 		struct bbr *bbr = inet_csk_ca(sk);
-		u64 bw = bbr_bw(sk);
-
-		bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
-		memset(&info->bbr, 0, sizeof(info->bbr));
-		info->bbr.bbr_bw_lo		= (u32)bw;
-		info->bbr.bbr_bw_hi		= (u32)(bw >> 32);
-		info->bbr.bbr_min_rtt		= bbr->min_rtt_us;
-		info->bbr.bbr_pacing_gain	= bbr->pacing_gain;
-		info->bbr.bbr_cwnd_gain		= bbr->cwnd_gain;
+		u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk));
+		u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk));
+		u64 bw_lo = bbr->bw_lo == ~0U ?
+			~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo);
+		struct tcp_bbr_info *bbr_info = &info->bbr;
+
+		memset(bbr_info, 0, sizeof(*bbr_info));
+		bbr_info->bbr_bw_lo		= (u32)bw;
+		bbr_info->bbr_bw_hi		= (u32)(bw >> 32);
+		bbr_info->bbr_min_rtt		= bbr->min_rtt_us;
+		bbr_info->bbr_pacing_gain	= bbr->pacing_gain;
+		bbr_info->bbr_cwnd_gain		= bbr->cwnd_gain;
+		bbr_info->bbr_bw_hi_lsb		= (u32)bw_hi;
+		bbr_info->bbr_bw_hi_msb		= (u32)(bw_hi >> 32);
+		bbr_info->bbr_bw_lo_lsb		= (u32)bw_lo;
+		bbr_info->bbr_bw_lo_msb		= (u32)(bw_lo >> 32);
+		bbr_info->bbr_mode		= bbr->mode;
+		bbr_info->bbr_phase		= (__u8)bbr_get_phase(bbr);
+		bbr_info->bbr_version		= (__u8)BBR_VERSION;
+		bbr_info->bbr_inflight_lo	= bbr->inflight_lo;
+		bbr_info->bbr_inflight_hi	= bbr->inflight_hi;
+		bbr_info->bbr_extra_acked	= bbr_extra_acked(sk);
 		*attr = INET_DIAG_BBRINFO;
-		return sizeof(info->bbr);
+		return sizeof(*bbr_info);
 	}
 	return 0;
 }
 
 __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
 	struct bbr *bbr = inet_csk_ca(sk);
 
 	if (new_state == TCP_CA_Loss) {
-		struct rate_sample rs = { .losses = 1 };
 
 		bbr->prev_ca_state = TCP_CA_Loss;
-		bbr->full_bw = 0;
-		bbr->round_start = 1;	/* treat RTO like end of a round */
-		bbr_lt_bw_sampling(sk, &rs);
+		tcp_plb_update_state_upon_rto(sk, &bbr->plb);
+		/* The tcp_write_timeout() call to sk_rethink_txhash() likely
+		 * repathed this flow, so re-learn the min network RTT on the
+		 * new path:
+		 */
+		bbr_reset_full_bw(sk);
+		if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) {
+			/* bbr_adapt_lower_bounds() needs cwnd before
+			 * we suffered an RTO, to update inflight_lo:
+			 */
+			bbr->inflight_lo =
+				max(tcp_snd_cwnd(tp), bbr->prior_cwnd);
+		}
+	} else if (bbr->prev_ca_state == TCP_CA_Loss &&
+		   new_state != TCP_CA_Loss) {
+		bbr_exit_loss_recovery(sk);
 	}
 }
 
+
 static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
-	.flags		= TCP_CONG_NON_RESTRICTED,
+	.flags		= TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS,
 	.name		= "bbr",
 	.owner		= THIS_MODULE,
 	.init		= bbr_init,
 	.cong_control	= bbr_main,
 	.sndbuf_expand	= bbr_sndbuf_expand,
+	.skb_marked_lost = bbr_skb_marked_lost,
 	.undo_cwnd	= bbr_undo_cwnd,
 	.cwnd_event	= bbr_cwnd_event,
 	.ssthresh	= bbr_ssthresh,
-	.min_tso_segs	= bbr_min_tso_segs,
+	.tso_segs	= bbr_tso_segs,
 	.get_info	= bbr_get_info,
 	.set_state	= bbr_set_state,
 };
@@ -1159,10 +2359,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids)
 BTF_ID_FLAGS(func, bbr_init)
 BTF_ID_FLAGS(func, bbr_main)
 BTF_ID_FLAGS(func, bbr_sndbuf_expand)
+BTF_ID_FLAGS(func, bbr_skb_marked_lost)
 BTF_ID_FLAGS(func, bbr_undo_cwnd)
 BTF_ID_FLAGS(func, bbr_cwnd_event)
 BTF_ID_FLAGS(func, bbr_ssthresh)
-BTF_ID_FLAGS(func, bbr_min_tso_segs)
+BTF_ID_FLAGS(func, bbr_tso_segs)
 BTF_ID_FLAGS(func, bbr_set_state)
 BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids)
 
@@ -1195,5 +2396,12 @@ MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
 MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
 MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
 MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
+MODULE_AUTHOR("Priyaranjan Jha <priyarjha@google.com>");
+MODULE_AUTHOR("Yousuk Seung <ysseung@google.com>");
+MODULE_AUTHOR("Kevin Yang <yyd@google.com>");
+MODULE_AUTHOR("Arjun Roy <arjunroy@google.com>");
+MODULE_AUTHOR("David Morley <morleyd@google.com>");
+
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");
+MODULE_VERSION(__stringify(BBR_VERSION));
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 48617d99abb0..d8008ac10ef0 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -236,6 +236,7 @@ void tcp_init_congestion_control(struct sock *sk)
 	struct inet_connection_sock *icsk = inet_csk(sk);
 
 	tcp_sk(sk)->prior_ssthresh = 0;
+	tcp_sk(sk)->fast_ack_mode = 0;
 	if (icsk->icsk_ca_ops->init)
 		icsk->icsk_ca_ops->init(sk);
 	if (tcp_ca_needs_ecn(sk))
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f130eccade39..3ccb30894a12 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -370,7 +370,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 			tcp_enter_quickack_mode(sk, 2);
 		break;
 	case INET_ECN_CE:
-		if (tcp_ca_needs_ecn(sk))
+		if (tcp_ca_wants_ce_events(sk))
 			tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
 
 		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
@@ -381,7 +381,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 		tp->ecn_flags |= TCP_ECN_SEEN;
 		break;
 	default:
-		if (tcp_ca_needs_ecn(sk))
+		if (tcp_ca_wants_ce_events(sk))
 			tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
 		tp->ecn_flags |= TCP_ECN_SEEN;
 		break;
@@ -1120,7 +1120,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
  */
 static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
 {
+	struct sock *sk = (struct sock *)tp;
+	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+
 	tp->lost += tcp_skb_pcount(skb);
+	if (ca_ops->skb_marked_lost)
+		ca_ops->skb_marked_lost(sk, skb);
 }
 
 void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
@@ -1501,6 +1506,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
 	WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
 	tcp_skb_pcount_add(skb, -pcount);
 
+	/* Adjust tx.in_flight as pcount is shifted from skb to prev. */
+	if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount,
+		      "prev in_flight: %u skb in_flight: %u pcount: %u",
+		      TCP_SKB_CB(prev)->tx.in_flight,
+		      TCP_SKB_CB(skb)->tx.in_flight,
+		      pcount))
+		TCP_SKB_CB(skb)->tx.in_flight = 0;
+	else
+		TCP_SKB_CB(skb)->tx.in_flight -= pcount;
+	TCP_SKB_CB(prev)->tx.in_flight += pcount;
+
 	/* When we're adding to gso_segs == 1, gso_size will be zero,
 	 * in theory this shouldn't be necessary but as long as DSACK
 	 * code can come after this skb later on it's better to keep
@@ -3795,7 +3811,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
 /* This routine deals with acks during a TLP episode and ends an episode by
  * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack
  */
-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag,
+				struct rate_sample *rs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -3812,6 +3829,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 		/* ACK advances: there was a loss, so reduce cwnd. Reset
 		 * tlp_high_seq in tcp_init_cwnd_reduction()
 		 */
+		tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY);
 		tcp_init_cwnd_reduction(sk);
 		tcp_set_ca_state(sk, TCP_CA_CWR);
 		tcp_end_cwnd_reduction(sk);
@@ -3822,6 +3840,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 			     FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
 		/* Pure dupack: original and TLP probe arrived; no loss */
 		tp->tlp_high_seq = 0;
+	} else {
+		/* This ACK matches a TLP retransmit. We cannot yet tell if
+		 * this ACK is for the original or the TLP retransmit.
+		 */
+		rs->is_acking_tlp_retrans_seq = 1;
 	}
 }
 
@@ -3930,6 +3953,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una;
 	rs.prior_in_flight = tcp_packets_in_flight(tp);
+	tcp_rate_check_app_limited(sk);
 
 	/* ts_recent update must be made after we are sure that the packet
 	 * is in window.
@@ -4004,7 +4028,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	tcp_rack_update_reo_wnd(sk, &rs);
 
 	if (tp->tlp_high_seq)
-		tcp_process_tlp_ack(sk, ack, flag);
+		tcp_process_tlp_ack(sk, ack, flag, &rs);
 
 	if (tcp_ack_is_dubious(sk, flag)) {
 		if (!(flag & (FLAG_SND_UNA_ADVANCED |
@@ -4028,6 +4052,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	delivered = tcp_newly_delivered(sk, delivered, flag);
 	lost = tp->lost - lost;			/* freshly marked lost */
 	rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
+	rs.is_ece = !!(flag & FLAG_ECE);
 	tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
 	tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
 	tcp_xmit_recovery(sk, rexmit);
@@ -4047,7 +4072,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	tcp_ack_probe(sk);
 
 	if (tp->tlp_high_seq)
-		tcp_process_tlp_ack(sk, ack, flag);
+		tcp_process_tlp_ack(sk, ack, flag, &rs);
 	return 1;
 
 old_ack:
@@ -5719,13 +5744,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
 
 	    /* More than one full frame received... */
 	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
+	     (tp->fast_ack_mode == 1 ||
 	     /* ... and right edge of window advances far enough.
 	      * (tcp_recvmsg() will send ACK otherwise).
 	      * If application uses SO_RCVLOWAT, we want send ack now if
 	      * we have not received enough bytes to satisfy the condition.
 	      */
-	    (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
-	     __tcp_select_window(sk) >= tp->rcv_wnd)) ||
+	      (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
+	       __tcp_select_window(sk) >= tp->rcv_wnd))) ||
 	    /* We ACK each frame or... */
 	    tcp_in_quickack_mode(sk) ||
 	    /* Protocol state mandates a one-time immediate ACK */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index d5da3ec8f846..7b25413ad225 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -465,6 +465,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
 	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
 	bool ca_got_dst = false;
 
+	tcp_set_ecn_low_from_dst(sk, dst);
+
 	if (ca_key != TCP_CA_UNSPEC) {
 		const struct tcp_congestion_ops *ca;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 95618d0e78e4..3f4bdd2b6476 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
 	bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
 		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
+	const struct dst_entry *dst = __sk_dst_get(sk);
 
 	if (!use_ecn) {
-		const struct dst_entry *dst = __sk_dst_get(sk);
-
 		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
 			use_ecn = true;
 	}
@@ -351,6 +350,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
 		tp->ecn_flags = TCP_ECN_OK;
 		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
 			INET_ECN_xmit(sk);
+
+		if (dst)
+			tcp_set_ecn_low_from_dst(sk, dst);
 	}
 }
 
@@ -388,7 +390,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
 				th->cwr = 1;
 				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
 			}
-		} else if (!tcp_ca_needs_ecn(sk)) {
+		} else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) &&
+			!tcp_ca_needs_ecn(sk)) {
 			/* ACK or retransmitted segment: clear ECT|CE */
 			INET_ECN_dontxmit(sk);
 		}
@@ -1601,7 +1604,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *buff;
-	int old_factor;
+	int old_factor, inflight_prev;
 	long limit;
 	int nlen;
 	u8 flags;
@@ -1676,6 +1679,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 
 		if (diff)
 			tcp_adjust_pcount(sk, skb, diff);
+
+		inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor;
+		if (inflight_prev < 0) {
+			WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious(
+					  old_factor,
+					  TCP_SKB_CB(skb)->sacked,
+					  TCP_SKB_CB(skb)->tx.in_flight),
+				  "inconsistent: tx.in_flight: %u "
+				  "old_factor: %d mss: %u sacked: %u "
+				  "1st pcount: %d 2nd pcount: %d "
+				  "1st len: %u 2nd len: %u ",
+				  TCP_SKB_CB(skb)->tx.in_flight, old_factor,
+				  mss_now, TCP_SKB_CB(skb)->sacked,
+				  tcp_skb_pcount(skb), tcp_skb_pcount(buff),
+				  skb->len, buff->len);
+			inflight_prev = 0;
+		}
+		/* Set 1st tx.in_flight as if 1st were sent by itself: */
+		TCP_SKB_CB(skb)->tx.in_flight = inflight_prev +
+						 tcp_skb_pcount(skb);
+		/* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */
+		TCP_SKB_CB(buff)->tx.in_flight = inflight_prev +
+						 tcp_skb_pcount(skb) +
+						 tcp_skb_pcount(buff);
 	}
 
 	/* Link BUFF into the send queue. */
@@ -2033,13 +2060,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
 static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
 {
 	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
-	u32 min_tso, tso_segs;
-
-	min_tso = ca_ops->min_tso_segs ?
-			ca_ops->min_tso_segs(sk) :
-			READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
+	u32 tso_segs;
 
-	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
+	tso_segs = ca_ops->tso_segs ?
+		ca_ops->tso_segs(sk, mss_now) :
+		tcp_tso_autosize(sk, mss_now,
+				 sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
 	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
 }
 
@@ -2767,6 +2793,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
 			list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
 			tcp_init_tso_segs(skb, mss_now);
+			tcp_set_tx_in_flight(sk, skb);
 			goto repair; /* Skip network transmission */
 		}
 
@@ -2981,6 +3008,7 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
 		goto rearm_timer;
 
+	tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited;
 	if (__tcp_retransmit_skb(sk, skb, 1))
 		goto rearm_timer;
 
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index a8f6d9d06f2e..8737f2134648 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -34,6 +34,24 @@
  * ready to send in the write queue.
  */
 
+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 in_flight;
+
+	/* Check, sanitize, and record packets in flight after skb was sent. */
+	in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb);
+	if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX,
+		      "insane in_flight %u cc %s mss %u "
+		      "cwnd %u pif %u %u %u %u\n",
+		      in_flight, inet_csk(sk)->icsk_ca_ops->name,
+		      tp->mss_cache, tp->snd_cwnd,
+		      tp->packets_out, tp->retrans_out,
+		      tp->sacked_out, tp->lost_out))
+		in_flight = TCPCB_IN_FLIGHT_MAX;
+	TCP_SKB_CB(skb)->tx.in_flight = in_flight;
+}
+
 /* Snapshot the current delivery information in the skb, to generate
  * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
  */
@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
 	TCP_SKB_CB(skb)->tx.delivered_mstamp	= tp->delivered_mstamp;
 	TCP_SKB_CB(skb)->tx.delivered		= tp->delivered;
 	TCP_SKB_CB(skb)->tx.delivered_ce	= tp->delivered_ce;
+	TCP_SKB_CB(skb)->tx.lost		= tp->lost;
 	TCP_SKB_CB(skb)->tx.is_app_limited	= tp->app_limited ? 1 : 0;
+	tcp_set_tx_in_flight(sk, skb);
 }
 
 /* When an skb is sacked or acked, we fill in the rate sample with the (prior)
@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
 	if (!rs->prior_delivered ||
 	    tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
 			       scb->end_seq, rs->last_end_seq)) {
+		rs->prior_lost	     = scb->tx.lost;
 		rs->prior_delivered_ce  = scb->tx.delivered_ce;
 		rs->prior_delivered  = scb->tx.delivered;
 		rs->prior_mstamp     = scb->tx.delivered_mstamp;
 		rs->is_app_limited   = scb->tx.is_app_limited;
 		rs->is_retrans	     = scb->sacked & TCPCB_RETRANS;
+		rs->tx_in_flight     = scb->tx.in_flight;
 		rs->last_end_seq     = scb->end_seq;
 
 		/* Record send time of most recently ACKed packet: */
 		tp->first_tx_mstamp  = tx_tstamp;
 		/* Find the duration of the "send phase" of this window: */
-		rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
-						     scb->tx.first_tx_mstamp);
+		rs->interval_us      = tcp_stamp32_us_delta(
+						tp->first_tx_mstamp,
+						scb->tx.first_tx_mstamp);
 
 	}
 	/* Mark off the skb delivered once it's sacked to avoid being
@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
 		return;
 	}
 	rs->delivered   = tp->delivered - rs->prior_delivered;
+	rs->lost        = tp->lost - rs->prior_lost;
 
 	rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
 	/* delivered_ce occupies less than 32 bits in the skb control block */
@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
 	 * longer phase.
 	 */
 	snd_us = rs->interval_us;				/* send phase */
-	ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+	ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp,
 				    rs->prior_mstamp); /* ack phase */
 	rs->interval_us = max(snd_us, ack_us);
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 4d40615dc8fc..f27941201ef2 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -689,6 +689,7 @@ void tcp_write_timer_handler(struct sock *sk)
 		return;
 	}
 
+	tcp_rate_check_app_limited(sk);
 	tcp_mstamp_refresh(tcp_sk(sk));
 	event = icsk->icsk_pending;
 
-- 
2.46.0

From 9406487b80b3d574e142d130baf5b014e206aa13 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sun, 8 Sep 2024 10:19:33 +0200
Subject: [PATCH 03/12] block

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 block/bfq-iosched.c | 120 ++++++++++++++++++++++++++++++++++++--------
 block/bfq-iosched.h |  16 +++++-
 block/mq-deadline.c | 110 +++++++++++++++++++++++++++++++++-------
 3 files changed, 203 insertions(+), 43 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 4b88a54a9b76..88df08a246fa 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -467,6 +467,21 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
 	return icq;
 }
 
+static struct bfq_io_cq *bfq_bic_try_lookup(struct request_queue *q)
+{
+	if (!current->io_context)
+		return NULL;
+	if (spin_trylock_irq(&q->queue_lock)) {
+		struct bfq_io_cq *icq;
+
+		icq = icq_to_bic(ioc_lookup_icq(q));
+		spin_unlock_irq(&q->queue_lock);
+		return icq;
+	}
+
+	return NULL;
+}
+
 /*
  * Scheduler run of queue, if there are requests pending and no one in the
  * driver that will restart queueing.
@@ -2454,10 +2469,21 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
 	 * returned by bfq_bic_lookup does not go away before
 	 * bfqd->lock is taken.
 	 */
-	struct bfq_io_cq *bic = bfq_bic_lookup(q);
+	struct bfq_io_cq *bic = bfq_bic_try_lookup(q);
 	bool ret;
 
-	spin_lock_irq(&bfqd->lock);
+	/*
+	 * bio merging is called for every bio queued, and it's very easy
+	 * to run into contention because of that. If we fail getting
+	 * the dd lock, just skip this merge attempt. For related IO, the
+	 * plug will be the successful merging point. If we get here, we
+	 * already failed doing the obvious merge. Chances of actually
+	 * getting a merge off this path is a lot slimmer, so skipping an
+	 * occassional lookup that will most likely not succeed anyway should
+	 * not be a problem.
+	 */
+	if (!spin_trylock_irq(&bfqd->lock))
+		return false;
 
 	if (bic) {
 		/*
@@ -5148,6 +5174,10 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
 {
 	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
 
+	if (!list_empty_careful(&bfqd->at_head) ||
+	    !list_empty_careful(&bfqd->at_tail))
+		return true;
+
 	/*
 	 * Avoiding lock: a race on bfqd->queued should cause at
 	 * most a call to dispatch for nothing
@@ -5297,15 +5327,61 @@ static inline void bfq_update_dispatch_stats(struct request_queue *q,
 					     bool idle_timer_disabled) {}
 #endif /* CONFIG_BFQ_CGROUP_DEBUG */
 
+static void bfq_insert_request(struct request_queue *q, struct request *rq,
+			       blk_insert_t flags, struct list_head *free);
+
+static void __bfq_do_insert(struct request_queue *q, blk_insert_t flags,
+			    struct list_head *list, struct list_head *free)
+{
+	while (!list_empty(list)) {
+		struct request *rq;
+
+		rq = list_first_entry(list, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		bfq_insert_request(q, rq, flags, free);
+	}
+}
+
+static void bfq_do_insert(struct request_queue *q, struct list_head *free)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	LIST_HEAD(at_head);
+	LIST_HEAD(at_tail);
+
+	spin_lock(&bfqd->insert_lock);
+	list_splice_init(&bfqd->at_head, &at_head);
+	list_splice_init(&bfqd->at_tail, &at_tail);
+	spin_unlock(&bfqd->insert_lock);
+
+	__bfq_do_insert(q, BLK_MQ_INSERT_AT_HEAD, &at_head, free);
+	__bfq_do_insert(q, 0, &at_tail, free);
+}
+
 static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
-	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+	struct request_queue *q = hctx->queue;
+	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct request *rq;
 	struct bfq_queue *in_serv_queue;
 	bool waiting_rq, idle_timer_disabled = false;
+	LIST_HEAD(free);
+
+	/*
+	 * If someone else is already dispatching, skip this one. This will
+	 * defer the next dispatch event to when something completes, and could
+	 * potentially lower the queue depth for contended cases.
+	 *
+	 * See the logic in blk_mq_do_dispatch_sched(), which loops and
+	 * retries if nothing is dispatched.
+	 */
+	if (test_bit(BFQ_DISPATCHING, &bfqd->run_state) ||
+	    test_and_set_bit_lock(BFQ_DISPATCHING, &bfqd->run_state))
+		return NULL;
 
 	spin_lock_irq(&bfqd->lock);
 
+	bfq_do_insert(hctx->queue, &free);
+
 	in_serv_queue = bfqd->in_service_queue;
 	waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
 
@@ -5315,7 +5391,9 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
 			waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
 	}
 
+	clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state);
 	spin_unlock_irq(&bfqd->lock);
+	blk_mq_free_requests(&free);
 	bfq_update_dispatch_stats(hctx->queue, rq,
 			idle_timer_disabled ? in_serv_queue : NULL,
 				idle_timer_disabled);
@@ -6236,27 +6314,21 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
 
 static struct bfq_queue *bfq_init_rq(struct request *rq);
 
-static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
-			       blk_insert_t flags)
+static void bfq_insert_request(struct request_queue *q, struct request *rq,
+			       blk_insert_t flags, struct list_head *free)
 {
-	struct request_queue *q = hctx->queue;
 	struct bfq_data *bfqd = q->elevator->elevator_data;
 	struct bfq_queue *bfqq;
 	bool idle_timer_disabled = false;
 	blk_opf_t cmd_flags;
-	LIST_HEAD(free);
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
 		bfqg_stats_update_legacy_io(q, rq);
 #endif
-	spin_lock_irq(&bfqd->lock);
 	bfqq = bfq_init_rq(rq);
-	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
-		spin_unlock_irq(&bfqd->lock);
-		blk_mq_free_requests(&free);
+	if (blk_mq_sched_try_insert_merge(q, rq, free))
 		return;
-	}
 
 	trace_block_rq_insert(rq);
 
@@ -6286,8 +6358,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	 * merge).
 	 */
 	cmd_flags = rq->cmd_flags;
-	spin_unlock_irq(&bfqd->lock);
-
 	bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
 				cmd_flags);
 }
@@ -6296,13 +6366,15 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
 				struct list_head *list,
 				blk_insert_t flags)
 {
-	while (!list_empty(list)) {
-		struct request *rq;
+	struct request_queue *q = hctx->queue;
+	struct bfq_data *bfqd = q->elevator->elevator_data;
 
-		rq = list_first_entry(list, struct request, queuelist);
-		list_del_init(&rq->queuelist);
-		bfq_insert_request(hctx, rq, flags);
-	}
+	spin_lock_irq(&bfqd->insert_lock);
+	if (flags & BLK_MQ_INSERT_AT_HEAD)
+		list_splice_init(list, &bfqd->at_head);
+	else
+		list_splice_init(list, &bfqd->at_tail);
+	spin_unlock_irq(&bfqd->insert_lock);
 }
 
 static void bfq_update_hw_tag(struct bfq_data *bfqd)
@@ -7211,6 +7283,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	q->elevator = eq;
 	spin_unlock_irq(&q->queue_lock);
 
+	spin_lock_init(&bfqd->lock);
+	spin_lock_init(&bfqd->insert_lock);
+
+	INIT_LIST_HEAD(&bfqd->at_head);
+	INIT_LIST_HEAD(&bfqd->at_tail);
+
 	/*
 	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
 	 * Grab a permanent reference to it, so that the normal code flow
@@ -7329,8 +7407,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	/* see comments on the definition of next field inside bfq_data */
 	bfqd->actuator_load_threshold = 4;
 
-	spin_lock_init(&bfqd->lock);
-
 	/*
 	 * The invocation of the next bfq_create_group_hierarchy
 	 * function is the head of a chain of function calls
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 467e8cfc41a2..f44f5d4ec2f4 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -504,12 +504,26 @@ struct bfq_io_cq {
 	unsigned int requests;	/* Number of requests this process has in flight */
 };
 
+enum {
+	BFQ_DISPATCHING	= 0,
+};
+
 /**
  * struct bfq_data - per-device data structure.
  *
  * All the fields are protected by @lock.
  */
 struct bfq_data {
+	struct {
+		spinlock_t lock;
+		spinlock_t insert_lock;
+	} ____cacheline_aligned_in_smp;
+
+	unsigned long run_state;
+
+	struct list_head at_head;
+	struct list_head at_tail;
+
 	/* device request queue */
 	struct request_queue *queue;
 	/* dispatch queue */
@@ -795,8 +809,6 @@ struct bfq_data {
 	/* fallback dummy bfqq for extreme OOM conditions */
 	struct bfq_queue oom_bfqq;
 
-	spinlock_t lock;
-
 	/*
 	 * bic associated with the task issuing current bio for
 	 * merging. This and the next field are used as a support to
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index acdc28756d9d..8b214233a061 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -79,10 +79,23 @@ struct dd_per_prio {
 	struct io_stats_per_prio stats;
 };
 
+enum {
+	DD_DISPATCHING	= 0,
+};
+
 struct deadline_data {
 	/*
 	 * run time data
 	 */
+	struct {
+		spinlock_t lock;
+		spinlock_t insert_lock;
+	} ____cacheline_aligned_in_smp;
+
+	unsigned long run_state;
+
+	struct list_head at_head;
+	struct list_head at_tail;
 
 	struct dd_per_prio per_prio[DD_PRIO_COUNT];
 
@@ -100,8 +113,6 @@ struct deadline_data {
 	int front_merges;
 	u32 async_depth;
 	int prio_aging_expire;
-
-	spinlock_t lock;
 };
 
 /* Maps an I/O priority class to a deadline scheduler priority. */
@@ -112,6 +123,9 @@ static const enum dd_prio ioprio_class_to_prio[] = {
 	[IOPRIO_CLASS_IDLE]	= DD_IDLE_PRIO,
 };
 
+static void dd_insert_request(struct request_queue *q, struct request *rq,
+			      blk_insert_t flags, struct list_head *free);
+
 static inline struct rb_root *
 deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
 {
@@ -451,6 +465,33 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
 	return NULL;
 }
 
+static void __dd_do_insert(struct request_queue *q, blk_insert_t flags,
+			   struct list_head *list, struct list_head *free)
+{
+	while (!list_empty(list)) {
+		struct request *rq;
+
+		rq = list_first_entry(list, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		dd_insert_request(q, rq, flags, free);
+	}
+}
+
+static void dd_do_insert(struct request_queue *q, struct list_head *free)
+{
+	struct deadline_data *dd = q->elevator->elevator_data;
+	LIST_HEAD(at_head);
+	LIST_HEAD(at_tail);
+
+	spin_lock(&dd->insert_lock);
+	list_splice_init(&dd->at_head, &at_head);
+	list_splice_init(&dd->at_tail, &at_tail);
+	spin_unlock(&dd->insert_lock);
+
+	__dd_do_insert(q, BLK_MQ_INSERT_AT_HEAD, &at_head, free);
+	__dd_do_insert(q, 0, &at_tail, free);
+}
+
 /*
  * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
  *
@@ -461,12 +502,27 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
  */
 static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
-	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+	struct request_queue *q = hctx->queue;
+	struct deadline_data *dd = q->elevator->elevator_data;
 	const unsigned long now = jiffies;
 	struct request *rq;
 	enum dd_prio prio;
+	LIST_HEAD(free);
+
+	/*
+	 * If someone else is already dispatching, skip this one. This will
+	 * defer the next dispatch event to when something completes, and could
+	 * potentially lower the queue depth for contended cases.
+	 *
+	 * See the logic in blk_mq_do_dispatch_sched(), which loops and
+	 * retries if nothing is dispatched.
+	 */
+	if (test_bit(DD_DISPATCHING, &dd->run_state) ||
+	    test_and_set_bit_lock(DD_DISPATCHING, &dd->run_state))
+		return NULL;
 
 	spin_lock(&dd->lock);
+	dd_do_insert(q, &free);
 	rq = dd_dispatch_prio_aged_requests(dd, now);
 	if (rq)
 		goto unlock;
@@ -482,8 +538,10 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 	}
 
 unlock:
+	clear_bit_unlock(DD_DISPATCHING, &dd->run_state);
 	spin_unlock(&dd->lock);
 
+	blk_mq_free_requests(&free);
 	return rq;
 }
 
@@ -585,6 +643,12 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 
 	eq->elevator_data = dd;
 
+	spin_lock_init(&dd->lock);
+	spin_lock_init(&dd->insert_lock);
+
+	INIT_LIST_HEAD(&dd->at_head);
+	INIT_LIST_HEAD(&dd->at_tail);
+
 	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
 		struct dd_per_prio *per_prio = &dd->per_prio[prio];
 
@@ -601,7 +665,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 	dd->last_dir = DD_WRITE;
 	dd->fifo_batch = fifo_batch;
 	dd->prio_aging_expire = prio_aging_expire;
-	spin_lock_init(&dd->lock);
 
 	/* We dispatch from request queue wide instead of hw queue */
 	blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
@@ -657,7 +720,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
 	struct request *free = NULL;
 	bool ret;
 
-	spin_lock(&dd->lock);
+	/*
+	 * bio merging is called for every bio queued, and it's very easy
+	 * to run into contention because of that. If we fail getting
+	 * the dd lock, just skip this merge attempt. For related IO, the
+	 * plug will be the successful merging point. If we get here, we
+	 * already failed doing the obvious merge. Chances of actually
+	 * getting a merge off this path is a lot slimmer, so skipping an
+	 * occassional lookup that will most likely not succeed anyway should
+	 * not be a problem.
+	 */
+	if (!spin_trylock(&dd->lock))
+		return false;
+
 	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
 	spin_unlock(&dd->lock);
 
@@ -670,10 +745,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
 /*
  * add rq to rbtree and fifo
  */
-static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+static void dd_insert_request(struct request_queue *q, struct request *rq,
 			      blk_insert_t flags, struct list_head *free)
 {
-	struct request_queue *q = hctx->queue;
 	struct deadline_data *dd = q->elevator->elevator_data;
 	const enum dd_data_dir data_dir = rq_data_dir(rq);
 	u16 ioprio = req_get_ioprio(rq);
@@ -727,19 +801,13 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
 {
 	struct request_queue *q = hctx->queue;
 	struct deadline_data *dd = q->elevator->elevator_data;
-	LIST_HEAD(free);
-
-	spin_lock(&dd->lock);
-	while (!list_empty(list)) {
-		struct request *rq;
-
-		rq = list_first_entry(list, struct request, queuelist);
-		list_del_init(&rq->queuelist);
-		dd_insert_request(hctx, rq, flags, &free);
-	}
-	spin_unlock(&dd->lock);
 
-	blk_mq_free_requests(&free);
+	spin_lock(&dd->insert_lock);
+	if (flags & BLK_MQ_INSERT_AT_HEAD)
+		list_splice_init(list, &dd->at_head);
+	else
+		list_splice_init(list, &dd->at_tail);
+	spin_unlock(&dd->insert_lock);
 }
 
 /* Callback from inside blk_mq_rq_ctx_init(). */
@@ -780,6 +848,10 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
 	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
 	enum dd_prio prio;
 
+	if (!list_empty_careful(&dd->at_head) ||
+	    !list_empty_careful(&dd->at_tail))
+		return true;
+
 	for (prio = 0; prio <= DD_PRIO_MAX; prio++)
 		if (dd_has_work_for_prio(&dd->per_prio[prio]))
 			return true;
-- 
2.46.0

From 46675daae50fe0767fac64e9be20b3523dd4fd22 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sun, 8 Sep 2024 10:20:15 +0200
Subject: [PATCH 04/12] cachy

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 .gitignore                                    |    6 +
 .../admin-guide/kernel-parameters.txt         |   12 +
 MAINTAINERS                                   |    7 +
 Makefile                                      |    9 +-
 arch/x86/Kconfig.cpu                          |  432 ++-
 arch/x86/Makefile                             |   45 +-
 arch/x86/include/asm/pci.h                    |    6 +
 arch/x86/include/asm/vermagic.h               |   76 +
 arch/x86/pci/common.c                         |    7 +-
 arch/x86/xen/setup.c                          |    5 +-
 block/bfq-iosched.c                           |    6 +
 block/elevator.c                              |   10 +
 drivers/Makefile                              |   13 +-
 drivers/ata/ahci.c                            |   23 +-
 drivers/cpufreq/Kconfig.x86                   |    2 -
 drivers/cpufreq/cpufreq.c                     |   27 +-
 drivers/cpufreq/intel_pstate.c                |    2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu.h           |    1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       |   10 +
 drivers/gpu/drm/amd/display/Kconfig           |    6 +
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |    2 +-
 .../amd/display/amdgpu_dm/amdgpu_dm_color.c   |    2 +-
 .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c    |    6 +-
 .../amd/display/amdgpu_dm/amdgpu_dm_plane.c   |    6 +-
 drivers/gpu/drm/amd/pm/amdgpu_pm.c            |    3 +
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     |   14 +-
 drivers/i2c/busses/Kconfig                    |    9 +
 drivers/i2c/busses/Makefile                   |    1 +
 drivers/i2c/busses/i2c-nct6775.c              |  648 ++++
 drivers/i2c/busses/i2c-piix4.c                |    4 +-
 drivers/input/evdev.c                         |   19 +-
 drivers/md/dm-crypt.c                         |    5 +
 drivers/media/v4l2-core/Kconfig               |    5 +
 drivers/media/v4l2-core/Makefile              |    2 +
 drivers/media/v4l2-core/v4l2loopback.c        | 3184 +++++++++++++++++
 drivers/media/v4l2-core/v4l2loopback.h        |   98 +
 .../media/v4l2-core/v4l2loopback_formats.h    |  445 +++
 drivers/pci/controller/Makefile               |    6 +
 drivers/pci/controller/intel-nvme-remap.c     |  462 +++
 drivers/pci/quirks.c                          |  101 +
 include/linux/cpufreq.h                       |    6 -
 include/linux/minmax.h                        |    7 +
 include/linux/pageblock-flags.h               |    4 +-
 include/linux/pagemap.h                       |    2 +-
 include/linux/user_namespace.h                |    4 +
 init/Kconfig                                  |   26 +
 kernel/Kconfig.hz                             |   24 +
 kernel/fork.c                                 |   14 +
 kernel/sched/fair.c                           |   13 +
 kernel/sched/sched.h                          |    2 +-
 kernel/sysctl.c                               |   12 +
 kernel/user_namespace.c                       |    7 +
 mm/Kconfig                                    |    2 +-
 mm/compaction.c                               |    4 +
 mm/huge_memory.c                              |    4 +
 mm/page-writeback.c                           |    8 +
 mm/page_alloc.c                               |    4 +
 mm/swap.c                                     |    5 +
 mm/vmpressure.c                               |    4 +
 mm/vmscan.c                                   |    8 +
 scripts/Makefile.package                      |   13 +
 scripts/package/PKGBUILD                      |  132 +
 62 files changed, 5933 insertions(+), 99 deletions(-)
 create mode 100644 drivers/i2c/busses/i2c-nct6775.c
 create mode 100644 drivers/media/v4l2-core/v4l2loopback.c
 create mode 100644 drivers/media/v4l2-core/v4l2loopback.h
 create mode 100644 drivers/media/v4l2-core/v4l2loopback_formats.h
 create mode 100644 drivers/pci/controller/intel-nvme-remap.c
 create mode 100644 scripts/package/PKGBUILD

diff --git a/.gitignore b/.gitignore
index c59dc60ba62e..7902adf4f7f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -92,6 +92,12 @@ modules.order
 #
 /tar-install/
 
+#
+# pacman files (make pacman-pkg)
+#
+/PKGBUILD
+/pacman/
+
 #
 # We don't want to ignore the following even if they are dot-files
 #
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index c82446cef8e2..ab5ca3af35d2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2229,6 +2229,9 @@
 			disable
 			  Do not enable intel_pstate as the default
 			  scaling driver for the supported processors
+			enable
+			  Enable intel_pstate in-case "disable" was passed
+			  previously in the kernel boot parameters
                         active
                           Use intel_pstate driver to bypass the scaling
                           governors layer of cpufreq and provides it own
@@ -4447,6 +4450,15 @@
 		nomsi		[MSI] If the PCI_MSI kernel config parameter is
 				enabled, this kernel boot option can be used to
 				disable the use of MSI interrupts system-wide.
+		pcie_acs_override =
+					[PCIE] Override missing PCIe ACS support for:
+				downstream
+					All downstream ports - full ACS capabilities
+				multfunction
+					All multifunction devices - multifunction ACS subset
+				id:nnnn:nnnn
+					Specfic device - full ACS capabilities
+					Specified as vid:did (vendor/device ID) in hex
 		noioapicquirk	[APIC] Disable all boot interrupt quirks.
 				Safety option to keep boot IRQs enabled. This
 				should never be necessary.
diff --git a/MAINTAINERS b/MAINTAINERS
index 958e935449e5..b27470be2e6a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11978,6 +11978,13 @@ F:	include/uapi/linux/nfsd/
 F:	include/uapi/linux/sunrpc/
 F:	net/sunrpc/
 
+KERNEL PACMAN PACKAGING (in addition to generic KERNEL BUILD)
+M:	Thomas Weißschuh <linux@weissschuh.net>
+R:	Christian Heusel <christian@heusel.eu>
+R:	Nathan Chancellor <nathan@kernel.org>
+S:	Maintained
+F:	scripts/package/PKGBUILD
+
 KERNEL REGRESSIONS
 M:	Thorsten Leemhuis <linux@leemhuis.info>
 L:	regressions@lists.linux.dev
diff --git a/Makefile b/Makefile
index 9b4614c0fcbb..e4423f147101 100644
--- a/Makefile
+++ b/Makefile
@@ -817,6 +817,9 @@ KBUILD_CFLAGS	+= -fno-delete-null-pointer-checks
 ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
 KBUILD_CFLAGS += -O2
 KBUILD_RUSTFLAGS += -Copt-level=2
+else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3
+KBUILD_CFLAGS += -O3
+KBUILD_RUSTFLAGS += -Copt-level=3
 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS += -Os
 KBUILD_RUSTFLAGS += -Copt-level=s
@@ -1005,9 +1008,9 @@ KBUILD_CFLAGS	+= -fno-strict-overflow
 # Make sure -fstack-check isn't enabled (like gentoo apparently did)
 KBUILD_CFLAGS  += -fno-stack-check
 
-# conserve stack if available
+# conserve stack, ivopts and modulo-sched if available
 ifdef CONFIG_CC_IS_GCC
-KBUILD_CFLAGS   += -fconserve-stack
+KBUILD_CFLAGS   += -fconserve-stack -fivopts -fmodulo-sched -fno-tree-vectorize
 endif
 
 # change __FILE__ to the relative path from the srctree
@@ -1497,7 +1500,7 @@ CLEAN_FILES += vmlinux.symvers modules-only.symvers \
 # Directories & files removed with 'make mrproper'
 MRPROPER_FILES += include/config include/generated          \
 		  arch/$(SRCARCH)/include/generated .objdiff \
-		  debian snap tar-install \
+		  debian snap tar-install PKGBUILD pacman \
 		  .config .config.old .version \
 		  Module.symvers \
 		  certs/signing_key.pem \
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2a7279d80460..3b077b9f9291 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -157,7 +157,7 @@ config MPENTIUM4
 
 
 config MK6
-	bool "K6/K6-II/K6-III"
+	bool "AMD K6/K6-II/K6-III"
 	depends on X86_32
 	help
 	  Select this for an AMD K6-family processor.  Enables use of
@@ -165,7 +165,7 @@ config MK6
 	  flags to GCC.
 
 config MK7
-	bool "Athlon/Duron/K7"
+	bool "AMD Athlon/Duron/K7"
 	depends on X86_32
 	help
 	  Select this for an AMD Athlon K7-family processor.  Enables use of
@@ -173,12 +173,114 @@ config MK7
 	  flags to GCC.
 
 config MK8
-	bool "Opteron/Athlon64/Hammer/K8"
+	bool "AMD Opteron/Athlon64/Hammer/K8"
 	help
 	  Select this for an AMD Opteron or Athlon64 Hammer-family processor.
 	  Enables use of some extended instructions, and passes appropriate
 	  optimization flags to GCC.
 
+config MK8SSE3
+	bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3"
+	help
+	  Select this for improved AMD Opteron or Athlon64 Hammer-family processors.
+	  Enables use of some extended instructions, and passes appropriate
+	  optimization flags to GCC.
+
+config MK10
+	bool "AMD 61xx/7x50/PhenomX3/X4/II/K10"
+	help
+	  Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50,
+	  Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor.
+	  Enables use of some extended instructions, and passes appropriate
+	  optimization flags to GCC.
+
+config MBARCELONA
+	bool "AMD Barcelona"
+	help
+	  Select this for AMD Family 10h Barcelona processors.
+
+	  Enables -march=barcelona
+
+config MBOBCAT
+	bool "AMD Bobcat"
+	help
+	  Select this for AMD Family 14h Bobcat processors.
+
+	  Enables -march=btver1
+
+config MJAGUAR
+	bool "AMD Jaguar"
+	help
+	  Select this for AMD Family 16h Jaguar processors.
+
+	  Enables -march=btver2
+
+config MBULLDOZER
+	bool "AMD Bulldozer"
+	help
+	  Select this for AMD Family 15h Bulldozer processors.
+
+	  Enables -march=bdver1
+
+config MPILEDRIVER
+	bool "AMD Piledriver"
+	help
+	  Select this for AMD Family 15h Piledriver processors.
+
+	  Enables -march=bdver2
+
+config MSTEAMROLLER
+	bool "AMD Steamroller"
+	help
+	  Select this for AMD Family 15h Steamroller processors.
+
+	  Enables -march=bdver3
+
+config MEXCAVATOR
+	bool "AMD Excavator"
+	help
+	  Select this for AMD Family 15h Excavator processors.
+
+	  Enables -march=bdver4
+
+config MZEN
+	bool "AMD Zen"
+	help
+	  Select this for AMD Family 17h Zen processors.
+
+	  Enables -march=znver1
+
+config MZEN2
+	bool "AMD Zen 2"
+	help
+	  Select this for AMD Family 17h Zen 2 processors.
+
+	  Enables -march=znver2
+
+config MZEN3
+	bool "AMD Zen 3"
+	depends on (CC_IS_GCC && GCC_VERSION >= 100300) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	help
+	  Select this for AMD Family 19h Zen 3 processors.
+
+	  Enables -march=znver3
+
+config MZEN4
+	bool "AMD Zen 4"
+	depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 160000)
+	help
+	  Select this for AMD Family 19h Zen 4 processors.
+
+	  Enables -march=znver4
+
+config MZEN5
+	bool "AMD Zen 5"
+	depends on (CC_IS_GCC && GCC_VERSION >= 140000) || (CC_IS_CLANG && CLANG_VERSION >= 180000)
+	help
+	  Select this for AMD Family 1Ah Zen 5 processors.
+
+	  Enables -march=znver5
+
 config MCRUSOE
 	bool "Crusoe"
 	depends on X86_32
@@ -270,7 +372,7 @@ config MPSC
 	  in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
 
 config MCORE2
-	bool "Core 2/newer Xeon"
+	bool "Intel Core 2"
 	help
 
 	  Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
@@ -278,6 +380,8 @@ config MCORE2
 	  family in /proc/cpuinfo. Newer ones have 6 and older ones 15
 	  (not a typo)
 
+	  Enables -march=core2
+
 config MATOM
 	bool "Intel Atom"
 	help
@@ -287,6 +391,212 @@ config MATOM
 	  accordingly optimized code. Use a recent GCC with specific Atom
 	  support in order to fully benefit from selecting this option.
 
+config MNEHALEM
+	bool "Intel Nehalem"
+	select X86_P6_NOP
+	help
+
+	  Select this for 1st Gen Core processors in the Nehalem family.
+
+	  Enables -march=nehalem
+
+config MWESTMERE
+	bool "Intel Westmere"
+	select X86_P6_NOP
+	help
+
+	  Select this for the Intel Westmere formerly Nehalem-C family.
+
+	  Enables -march=westmere
+
+config MSILVERMONT
+	bool "Intel Silvermont"
+	select X86_P6_NOP
+	help
+
+	  Select this for the Intel Silvermont platform.
+
+	  Enables -march=silvermont
+
+config MGOLDMONT
+	bool "Intel Goldmont"
+	select X86_P6_NOP
+	help
+
+	  Select this for the Intel Goldmont platform including Apollo Lake and Denverton.
+
+	  Enables -march=goldmont
+
+config MGOLDMONTPLUS
+	bool "Intel Goldmont Plus"
+	select X86_P6_NOP
+	help
+
+	  Select this for the Intel Goldmont Plus platform including Gemini Lake.
+
+	  Enables -march=goldmont-plus
+
+config MSANDYBRIDGE
+	bool "Intel Sandy Bridge"
+	select X86_P6_NOP
+	help
+
+	  Select this for 2nd Gen Core processors in the Sandy Bridge family.
+
+	  Enables -march=sandybridge
+
+config MIVYBRIDGE
+	bool "Intel Ivy Bridge"
+	select X86_P6_NOP
+	help
+
+	  Select this for 3rd Gen Core processors in the Ivy Bridge family.
+
+	  Enables -march=ivybridge
+
+config MHASWELL
+	bool "Intel Haswell"
+	select X86_P6_NOP
+	help
+
+	  Select this for 4th Gen Core processors in the Haswell family.
+
+	  Enables -march=haswell
+
+config MBROADWELL
+	bool "Intel Broadwell"
+	select X86_P6_NOP
+	help
+
+	  Select this for 5th Gen Core processors in the Broadwell family.
+
+	  Enables -march=broadwell
+
+config MSKYLAKE
+	bool "Intel Skylake"
+	select X86_P6_NOP
+	help
+
+	  Select this for 6th Gen Core processors in the Skylake family.
+
+	  Enables -march=skylake
+
+config MSKYLAKEX
+	bool "Intel Skylake X"
+	select X86_P6_NOP
+	help
+
+	  Select this for 6th Gen Core processors in the Skylake X family.
+
+	  Enables -march=skylake-avx512
+
+config MCANNONLAKE
+	bool "Intel Cannon Lake"
+	select X86_P6_NOP
+	help
+
+	  Select this for 8th Gen Core processors
+
+	  Enables -march=cannonlake
+
+config MICELAKE
+	bool "Intel Ice Lake"
+	select X86_P6_NOP
+	help
+
+	  Select this for 10th Gen Core processors in the Ice Lake family.
+
+	  Enables -march=icelake-client
+
+config MCASCADELAKE
+	bool "Intel Cascade Lake"
+	select X86_P6_NOP
+	help
+
+	  Select this for Xeon processors in the Cascade Lake family.
+
+	  Enables -march=cascadelake
+
+config MCOOPERLAKE
+	bool "Intel Cooper Lake"
+	depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000)
+	select X86_P6_NOP
+	help
+
+	  Select this for Xeon processors in the Cooper Lake family.
+
+	  Enables -march=cooperlake
+
+config MTIGERLAKE
+	bool "Intel Tiger Lake"
+	depends on  (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000)
+	select X86_P6_NOP
+	help
+
+	  Select this for third-generation 10 nm process processors in the Tiger Lake family.
+
+	  Enables -march=tigerlake
+
+config MSAPPHIRERAPIDS
+	bool "Intel Sapphire Rapids"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	select X86_P6_NOP
+	help
+
+	  Select this for fourth-generation 10 nm process processors in the Sapphire Rapids family.
+
+	  Enables -march=sapphirerapids
+
+config MROCKETLAKE
+	bool "Intel Rocket Lake"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	select X86_P6_NOP
+	help
+
+	  Select this for eleventh-generation processors in the Rocket Lake family.
+
+	  Enables -march=rocketlake
+
+config MALDERLAKE
+	bool "Intel Alder Lake"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	select X86_P6_NOP
+	help
+
+	  Select this for twelfth-generation processors in the Alder Lake family.
+
+	  Enables -march=alderlake
+
+config MRAPTORLAKE
+	bool "Intel Raptor Lake"
+	depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+	select X86_P6_NOP
+	help
+
+	  Select this for thirteenth-generation processors in the Raptor Lake family.
+
+	  Enables -march=raptorlake
+
+config MMETEORLAKE
+	bool "Intel Meteor Lake"
+	depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+	select X86_P6_NOP
+	help
+
+	  Select this for fourteenth-generation processors in the Meteor Lake family.
+
+	  Enables -march=meteorlake
+
+config MEMERALDRAPIDS
+	bool "Intel Emerald Rapids"
+	depends on (CC_IS_GCC && GCC_VERSION > 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500)
+	select X86_P6_NOP
+	help
+
+	  Select this for fifth-generation 10 nm process processors in the Emerald Rapids family.
+
+	  Enables -march=emeraldrapids
+
 config GENERIC_CPU
 	bool "Generic-x86-64"
 	depends on X86_64
@@ -294,6 +604,50 @@ config GENERIC_CPU
 	  Generic x86-64 CPU.
 	  Run equally well on all x86-64 CPUs.
 
+config GENERIC_CPU2
+	bool "Generic-x86-64-v2"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	depends on X86_64
+	help
+	  Generic x86-64 CPU.
+	  Run equally well on all x86-64 CPUs with min support of x86-64-v2.
+
+config GENERIC_CPU3
+	bool "Generic-x86-64-v3"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	depends on X86_64
+	help
+	  Generic x86-64-v3 CPU with v3 instructions.
+	  Run equally well on all x86-64 CPUs with min support of x86-64-v3.
+
+config GENERIC_CPU4
+	bool "Generic-x86-64-v4"
+	depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000)
+	depends on X86_64
+	help
+	  Generic x86-64 CPU with v4 instructions.
+	  Run equally well on all x86-64 CPUs with min support of x86-64-v4.
+
+config MNATIVE_INTEL
+	bool "Intel-Native optimizations autodetected by the compiler"
+	help
+
+	  Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects
+	  the optimum settings to use based on your processor. Do NOT use this
+	  for AMD CPUs.  Intel Only!
+
+	  Enables -march=native
+
+config MNATIVE_AMD
+	bool "AMD-Native optimizations autodetected by the compiler"
+	help
+
+	  Clang 3.8, GCC 4.2 and above support -march=native, which automatically detects
+	  the optimum settings to use based on your processor. Do NOT use this
+	  for Intel CPUs.  AMD Only!
+
+	  Enables -march=native
+
 endchoice
 
 config X86_GENERIC
@@ -318,9 +672,17 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
-	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
+	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 \
+	|| MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \
+	|| MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT \
+	|| MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \
+	|| MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \
+	|| MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \
+	|| MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD || X86_GENERIC || GENERIC_CPU || GENERIC_CPU2 \
+	|| GENERIC_CPU3 || GENERIC_CPU4
 	default "4" if MELAN || M486SX || M486 || MGEODEGX1
-	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
+	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII \
+	|| MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
 
 config X86_F00F_BUG
 	def_bool y
@@ -332,15 +694,27 @@ config X86_INVD_BUG
 
 config X86_ALIGNMENT_16
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC \
+	|| M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1
 
 config X86_INTEL_USERCOPY
 	def_bool y
-	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
+	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC \
+	|| MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \
+	|| MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \
+	|| MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \
+	|| MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL
 
 config X86_USE_PPRO_CHECKSUM
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \
+	|| MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX \
+	|| MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \
+	|| MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM \
+	|| MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE \
+	|| MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE \
+	|| MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \
+	|| MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD
 
 #
 # P6_NOPs are a relatively minor optimization that require a family >=
@@ -356,11 +730,22 @@ config X86_USE_PPRO_CHECKSUM
 config X86_P6_NOP
 	def_bool y
 	depends on X86_64
-	depends on (MCORE2 || MPENTIUM4 || MPSC)
+	depends on (MCORE2 || MPENTIUM4 || MPSC || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \
+	|| MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE \
+	|| MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE \
+	|| MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS \
+	|| MNATIVE_INTEL)
 
 config X86_TSC
 	def_bool y
-	depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64
+	depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \
+	|| MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 \
+	|| MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \
+	|| MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM \
+	|| MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL \
+	|| MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \
+	|| MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS \
+	|| MNATIVE_INTEL || MNATIVE_AMD) || X86_64
 
 config X86_HAVE_PAE
 	def_bool y
@@ -368,18 +753,37 @@ config X86_HAVE_PAE
 
 config X86_CMPXCHG64
 	def_bool y
-	depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7
+	depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \
+	|| M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 \
+	|| MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN \
+	|| MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS \
+	|| MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE \
+	|| MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \
+	|| MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD
 
 # this should be set for all -march=.. options where the compiler
 # generates cmov.
 config X86_CMOV
 	def_bool y
-	depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
+	depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \
+	|| MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 \
+	|| MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR \
+	|| MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \
+	|| MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \
+	|| MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \
+	|| MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD)
 
 config X86_MINIMUM_CPU_FAMILY
 	int
 	default "64" if X86_64
-	default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8)
+	default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \
+	|| MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8 ||  MK8SSE3 \
+	|| MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \
+	|| MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT \
+	|| MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \
+	|| MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \
+	|| MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MRAPTORLAKE \
+	|| MNATIVE_INTEL || MNATIVE_AMD)
 	default "5" if X86_32 && X86_CMPXCHG64
 	default "4"
 
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 801fd85c3ef6..93cc88b59cbb 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -176,8 +176,49 @@ else
         # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
         cflags-$(CONFIG_MK8)		+= -march=k8
         cflags-$(CONFIG_MPSC)		+= -march=nocona
-        cflags-$(CONFIG_MCORE2)		+= -march=core2
-        cflags-$(CONFIG_MATOM)		+= -march=atom
+        cflags-$(CONFIG_MK8SSE3)	+= -march=k8-sse3
+        cflags-$(CONFIG_MK10) 		+= -march=amdfam10
+        cflags-$(CONFIG_MBARCELONA) 	+= -march=barcelona
+        cflags-$(CONFIG_MBOBCAT) 	+= -march=btver1
+        cflags-$(CONFIG_MJAGUAR) 	+= -march=btver2
+        cflags-$(CONFIG_MBULLDOZER) 	+= -march=bdver1
+        cflags-$(CONFIG_MPILEDRIVER)	+= -march=bdver2 -mno-tbm
+        cflags-$(CONFIG_MSTEAMROLLER) 	+= -march=bdver3 -mno-tbm
+        cflags-$(CONFIG_MEXCAVATOR) 	+= -march=bdver4 -mno-tbm
+        cflags-$(CONFIG_MZEN) 		+= -march=znver1
+        cflags-$(CONFIG_MZEN2) 	+= -march=znver2
+        cflags-$(CONFIG_MZEN3) 	+= -march=znver3
+        cflags-$(CONFIG_MZEN4) 	+= -march=znver4
+        cflags-$(CONFIG_MZEN5) 	+= -march=znver5
+        cflags-$(CONFIG_MNATIVE_INTEL) += -march=native
+        cflags-$(CONFIG_MNATIVE_AMD) 	+= -march=native
+        cflags-$(CONFIG_MATOM) 	+= -march=bonnell
+        cflags-$(CONFIG_MCORE2) 	+= -march=core2
+        cflags-$(CONFIG_MNEHALEM) 	+= -march=nehalem
+        cflags-$(CONFIG_MWESTMERE) 	+= -march=westmere
+        cflags-$(CONFIG_MSILVERMONT) 	+= -march=silvermont
+        cflags-$(CONFIG_MGOLDMONT) 	+= -march=goldmont
+        cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus
+        cflags-$(CONFIG_MSANDYBRIDGE) 	+= -march=sandybridge
+        cflags-$(CONFIG_MIVYBRIDGE) 	+= -march=ivybridge
+        cflags-$(CONFIG_MHASWELL) 	+= -march=haswell
+        cflags-$(CONFIG_MBROADWELL) 	+= -march=broadwell
+        cflags-$(CONFIG_MSKYLAKE) 	+= -march=skylake
+        cflags-$(CONFIG_MSKYLAKEX) 	+= -march=skylake-avx512
+        cflags-$(CONFIG_MCANNONLAKE) 	+= -march=cannonlake
+        cflags-$(CONFIG_MICELAKE) 	+= -march=icelake-client
+        cflags-$(CONFIG_MCASCADELAKE) 	+= -march=cascadelake
+        cflags-$(CONFIG_MCOOPERLAKE) 	+= -march=cooperlake
+        cflags-$(CONFIG_MTIGERLAKE) 	+= -march=tigerlake
+        cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids
+        cflags-$(CONFIG_MROCKETLAKE) 	+= -march=rocketlake
+        cflags-$(CONFIG_MALDERLAKE) 	+= -march=alderlake
+        cflags-$(CONFIG_MRAPTORLAKE) 	+= -march=raptorlake
+        cflags-$(CONFIG_MMETEORLAKE) 	+= -march=meteorlake
+        cflags-$(CONFIG_MEMERALDRAPIDS)	+= -march=emeraldrapids
+        cflags-$(CONFIG_GENERIC_CPU2) 	+= -march=x86-64-v2
+        cflags-$(CONFIG_GENERIC_CPU3) 	+= -march=x86-64-v3
+        cflags-$(CONFIG_GENERIC_CPU4) 	+= -march=x86-64-v4
         cflags-$(CONFIG_GENERIC_CPU)	+= -mtune=generic
         KBUILD_CFLAGS += $(cflags-y)
 
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index b3ab80a03365..5e883b397ff3 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -26,6 +26,7 @@ struct pci_sysdata {
 #if IS_ENABLED(CONFIG_VMD)
 	struct pci_dev	*vmd_dev;	/* VMD Device if in Intel VMD domain */
 #endif
+	struct pci_dev	*nvme_remap_dev;	/* AHCI Device if NVME remapped bus */
 };
 
 extern int pci_routeirq;
@@ -69,6 +70,11 @@ static inline bool is_vmd(struct pci_bus *bus)
 #define is_vmd(bus)		false
 #endif /* CONFIG_VMD */
 
+static inline bool is_nvme_remap(struct pci_bus *bus)
+{
+	return to_pci_sysdata(bus)->nvme_remap_dev != NULL;
+}
+
 /* Can be used to override the logic in pci_scan_bus for skipping
    already-configured bus numbers - to be used for buggy BIOSes
    or architectures with incomplete PCI setup by the loader */
diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h
index 75884d2cdec3..7acca9b5a9d5 100644
--- a/arch/x86/include/asm/vermagic.h
+++ b/arch/x86/include/asm/vermagic.h
@@ -17,6 +17,54 @@
 #define MODULE_PROC_FAMILY "586MMX "
 #elif defined CONFIG_MCORE2
 #define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_MNATIVE_INTEL
+#define MODULE_PROC_FAMILY "NATIVE_INTEL "
+#elif defined CONFIG_MNATIVE_AMD
+#define MODULE_PROC_FAMILY "NATIVE_AMD "
+#elif defined CONFIG_MNEHALEM
+#define MODULE_PROC_FAMILY "NEHALEM "
+#elif defined CONFIG_MWESTMERE
+#define MODULE_PROC_FAMILY "WESTMERE "
+#elif defined CONFIG_MSILVERMONT
+#define MODULE_PROC_FAMILY "SILVERMONT "
+#elif defined CONFIG_MGOLDMONT
+#define MODULE_PROC_FAMILY "GOLDMONT "
+#elif defined CONFIG_MGOLDMONTPLUS
+#define MODULE_PROC_FAMILY "GOLDMONTPLUS "
+#elif defined CONFIG_MSANDYBRIDGE
+#define MODULE_PROC_FAMILY "SANDYBRIDGE "
+#elif defined CONFIG_MIVYBRIDGE
+#define MODULE_PROC_FAMILY "IVYBRIDGE "
+#elif defined CONFIG_MHASWELL
+#define MODULE_PROC_FAMILY "HASWELL "
+#elif defined CONFIG_MBROADWELL
+#define MODULE_PROC_FAMILY "BROADWELL "
+#elif defined CONFIG_MSKYLAKE
+#define MODULE_PROC_FAMILY "SKYLAKE "
+#elif defined CONFIG_MSKYLAKEX
+#define MODULE_PROC_FAMILY "SKYLAKEX "
+#elif defined CONFIG_MCANNONLAKE
+#define MODULE_PROC_FAMILY "CANNONLAKE "
+#elif defined CONFIG_MICELAKE
+#define MODULE_PROC_FAMILY "ICELAKE "
+#elif defined CONFIG_MCASCADELAKE
+#define MODULE_PROC_FAMILY "CASCADELAKE "
+#elif defined CONFIG_MCOOPERLAKE
+#define MODULE_PROC_FAMILY "COOPERLAKE "
+#elif defined CONFIG_MTIGERLAKE
+#define MODULE_PROC_FAMILY "TIGERLAKE "
+#elif defined CONFIG_MSAPPHIRERAPIDS
+#define MODULE_PROC_FAMILY "SAPPHIRERAPIDS "
+#elif defined CONFIG_ROCKETLAKE
+#define MODULE_PROC_FAMILY "ROCKETLAKE "
+#elif defined CONFIG_MALDERLAKE
+#define MODULE_PROC_FAMILY "ALDERLAKE "
+#elif defined CONFIG_MRAPTORLAKE
+#define MODULE_PROC_FAMILY "RAPTORLAKE "
+#elif defined CONFIG_MMETEORLAKE
+#define MODULE_PROC_FAMILY "METEORLAKE "
+#elif defined CONFIG_MEMERALDRAPIDS
+#define MODULE_PROC_FAMILY "EMERALDRAPIDS "
 #elif defined CONFIG_MATOM
 #define MODULE_PROC_FAMILY "ATOM "
 #elif defined CONFIG_M686
@@ -35,6 +83,34 @@
 #define MODULE_PROC_FAMILY "K7 "
 #elif defined CONFIG_MK8
 #define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_MK8SSE3
+#define MODULE_PROC_FAMILY "K8SSE3 "
+#elif defined CONFIG_MK10
+#define MODULE_PROC_FAMILY "K10 "
+#elif defined CONFIG_MBARCELONA
+#define MODULE_PROC_FAMILY "BARCELONA "
+#elif defined CONFIG_MBOBCAT
+#define MODULE_PROC_FAMILY "BOBCAT "
+#elif defined CONFIG_MBULLDOZER
+#define MODULE_PROC_FAMILY "BULLDOZER "
+#elif defined CONFIG_MPILEDRIVER
+#define MODULE_PROC_FAMILY "PILEDRIVER "
+#elif defined CONFIG_MSTEAMROLLER
+#define MODULE_PROC_FAMILY "STEAMROLLER "
+#elif defined CONFIG_MJAGUAR
+#define MODULE_PROC_FAMILY "JAGUAR "
+#elif defined CONFIG_MEXCAVATOR
+#define MODULE_PROC_FAMILY "EXCAVATOR "
+#elif defined CONFIG_MZEN
+#define MODULE_PROC_FAMILY "ZEN "
+#elif defined CONFIG_MZEN2
+#define MODULE_PROC_FAMILY "ZEN2 "
+#elif defined CONFIG_MZEN3
+#define MODULE_PROC_FAMILY "ZEN3 "
+#elif defined CONFIG_MZEN4
+#define MODULE_PROC_FAMILY "ZEN4 "
+#elif defined CONFIG_MZEN5
+#define MODULE_PROC_FAMILY "ZEN5 "
 #elif defined CONFIG_MELAN
 #define MODULE_PROC_FAMILY "ELAN "
 #elif defined CONFIG_MCRUSOE
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index ddb798603201..7c20387d8202 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void)
 		return 0;
 }
 
-#if IS_ENABLED(CONFIG_VMD)
 struct pci_dev *pci_real_dma_dev(struct pci_dev *dev)
 {
+#if IS_ENABLED(CONFIG_VMD)
 	if (is_vmd(dev->bus))
 		return to_pci_sysdata(dev->bus)->vmd_dev;
+#endif
+
+	if (is_nvme_remap(dev->bus))
+		return to_pci_sysdata(dev->bus)->nvme_remap_dev;
 
 	return dev;
 }
-#endif
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 380591028cb8..f6f5aa569367 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -691,6 +691,7 @@ char * __init xen_memory_setup(void)
 	struct xen_memory_map memmap;
 	unsigned long max_pages;
 	unsigned long extra_pages = 0;
+	unsigned long maxmem_pages;
 	int i;
 	int op;
 
@@ -762,8 +763,8 @@ char * __init xen_memory_setup(void)
 	 * Make sure we have no memory above max_pages, as this area
 	 * isn't handled by the p2m management.
 	 */
-	extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
-			   extra_pages, max_pages - max_pfn);
+	maxmem_pages = EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM));
+	extra_pages = min3(maxmem_pages, extra_pages, max_pages - max_pfn);
 	i = 0;
 	addr = xen_e820_table.entries[0].addr;
 	size = xen_e820_table.entries[0].size;
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 88df08a246fa..deecce63d0fc 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -7703,6 +7703,7 @@ MODULE_ALIAS("bfq-iosched");
 static int __init bfq_init(void)
 {
 	int ret;
+	char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.10";
 
 #ifdef CONFIG_BFQ_GROUP_IOSCHED
 	ret = blkcg_policy_register(&blkcg_policy_bfq);
@@ -7734,6 +7735,11 @@ static int __init bfq_init(void)
 	if (ret)
 		goto slab_kill;
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	strcat(msg, " (with cgroups support)");
+#endif
+	pr_info("%s", msg);
+
 	return 0;
 
 slab_kill:
diff --git a/block/elevator.c b/block/elevator.c
index f64ebd726e58..4f1ccf8cf250 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -567,9 +567,19 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
 
 	if (q->nr_hw_queues != 1 &&
 	    !blk_mq_is_shared_tags(q->tag_set->flags))
+#if defined(CONFIG_CACHY) && defined(CONFIG_MQ_IOSCHED_KYBER)
+		return elevator_find_get(q, "kyber");
+#elif defined(CONFIG_CACHY)
+		return elevator_find_get(q, "mq-deadline");
+#else
 		return NULL;
+#endif
 
+#if defined(CONFIG_CACHY) && defined(CONFIG_IOSCHED_BFQ)
+	return elevator_find_get(q, "bfq");
+#else
 	return elevator_find_get(q, "mq-deadline");
+#endif
 }
 
 /*
diff --git a/drivers/Makefile b/drivers/Makefile
index fe9ceb0d2288..b58955caf19b 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -61,14 +61,8 @@ obj-y				+= char/
 # iommu/ comes before gpu as gpu are using iommu controllers
 obj-y				+= iommu/
 
-# gpu/ comes after char for AGP vs DRM startup and after iommu
-obj-y				+= gpu/
-
 obj-$(CONFIG_CONNECTOR)		+= connector/
 
-# i810fb depends on char/agp/
-obj-$(CONFIG_FB_I810)           += video/fbdev/i810/
-
 obj-$(CONFIG_PARPORT)		+= parport/
 obj-y				+= base/ block/ misc/ mfd/ nfc/
 obj-$(CONFIG_LIBNVDIMM)		+= nvdimm/
@@ -80,6 +74,13 @@ obj-y				+= macintosh/
 obj-y				+= scsi/
 obj-y				+= nvme/
 obj-$(CONFIG_ATA)		+= ata/
+
+# gpu/ comes after char for AGP vs DRM startup and after iommu
+obj-y				+= gpu/
+
+# i810fb depends on char/agp/
+obj-$(CONFIG_FB_I810)           += video/fbdev/i810/
+
 obj-$(CONFIG_TARGET_CORE)	+= target/
 obj-$(CONFIG_MTD)		+= mtd/
 obj-$(CONFIG_SPI)		+= spi/
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index fc6fd583faf8..f79e205a51dd 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1618,7 +1618,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance)
 }
 #endif
 
-static void ahci_remap_check(struct pci_dev *pdev, int bar,
+static int ahci_remap_check(struct pci_dev *pdev, int bar,
 		struct ahci_host_priv *hpriv)
 {
 	int i;
@@ -1631,7 +1631,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
 	    pci_resource_len(pdev, bar) < SZ_512K ||
 	    bar != AHCI_PCI_BAR_STANDARD ||
 	    !(readl(hpriv->mmio + AHCI_VSCAP) & 1))
-		return;
+		return 0;
 
 	cap = readq(hpriv->mmio + AHCI_REMAP_CAP);
 	for (i = 0; i < AHCI_MAX_REMAP; i++) {
@@ -1646,18 +1646,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar,
 	}
 
 	if (!hpriv->remapped_nvme)
-		return;
-
-	dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n",
-		 hpriv->remapped_nvme);
-	dev_warn(&pdev->dev,
-		 "Switch your BIOS from RAID to AHCI mode to use them.\n");
+		return 0;
 
-	/*
-	 * Don't rely on the msi-x capability in the remap case,
-	 * share the legacy interrupt across ahci and remapped devices.
-	 */
-	hpriv->flags |= AHCI_HFLAG_NO_MSI;
+	/* Abort probe, allowing intel-nvme-remap to step in when available */
+	dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n");
+	return -ENODEV;
 }
 
 static int ahci_get_irq_vector(struct ata_host *host, int port)
@@ -1894,7 +1887,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar];
 
 	/* detect remapped nvme devices */
-	ahci_remap_check(pdev, ahci_pci_bar, hpriv);
+	rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv);
+	if (rc)
+		return rc;
 
 	sysfs_add_file_to_group(&pdev->dev.kobj,
 				&dev_attr_remapped_nvme.attr,
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index 97c2d4f15d76..5a3af44d785a 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -9,7 +9,6 @@ config X86_INTEL_PSTATE
 	select ACPI_PROCESSOR if ACPI
 	select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO
 	select CPU_FREQ_GOV_PERFORMANCE
-	select CPU_FREQ_GOV_SCHEDUTIL if SMP
 	help
 	  This driver provides a P state for Intel core processors.
 	  The driver implements an internal governor and will become
@@ -39,7 +38,6 @@ config X86_AMD_PSTATE
 	depends on X86 && ACPI
 	select ACPI_PROCESSOR
 	select ACPI_CPPC_LIB if X86_64
-	select CPU_FREQ_GOV_SCHEDUTIL if SMP
 	help
 	  This driver adds a CPUFreq driver which utilizes a fine grain
 	  processor performance frequency control range instead of legacy
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 270ea04fb616..d7ae2a6bcd7b 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -575,30 +575,11 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy)
 		return policy->transition_delay_us;
 
 	latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
-	if (latency) {
-		unsigned int max_delay_us = 2 * MSEC_PER_SEC;
+	if (latency)
+		/* Give a 50% breathing room between updates */
+		return latency + (latency >> 1);
 
-		/*
-		 * If the platform already has high transition_latency, use it
-		 * as-is.
-		 */
-		if (latency > max_delay_us)
-			return latency;
-
-		/*
-		 * For platforms that can change the frequency very fast (< 2
-		 * us), the above formula gives a decent transition delay. But
-		 * for platforms where transition_latency is in milliseconds, it
-		 * ends up giving unrealistic values.
-		 *
-		 * Cap the default transition delay to 2 ms, which seems to be
-		 * a reasonable amount of time after which we should reevaluate
-		 * the frequency.
-		 */
-		return min(latency * LATENCY_MULTIPLIER, max_delay_us);
-	}
-
-	return LATENCY_MULTIPLIER;
+	return USEC_PER_MSEC;
 }
 EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us);
 
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c31914a9876f..1035c074f36a 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -3550,6 +3550,8 @@ static int __init intel_pstate_setup(char *str)
 
 	if (!strcmp(str, "disable"))
 		no_load = 1;
+	else if (!strcmp(str, "enable"))
+		no_load = 0;
 	else if (!strcmp(str, "active"))
 		default_driver = &intel_pstate;
 	else if (!strcmp(str, "passive"))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f87d53e183c3..c489d3b2576b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -159,6 +159,7 @@ struct amdgpu_watchdog_timer {
  */
 extern int amdgpu_modeset;
 extern unsigned int amdgpu_vram_limit;
+extern int amdgpu_ignore_min_pcap;
 extern int amdgpu_vis_vram_limit;
 extern int amdgpu_gart_size;
 extern int amdgpu_gtt_size;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index ea14f1c8f430..bb0b636d0d75 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -132,6 +132,7 @@ enum AMDGPU_DEBUG_MASK {
 };
 
 unsigned int amdgpu_vram_limit = UINT_MAX;
+int amdgpu_ignore_min_pcap = 0; /* do not ignore by default */
 int amdgpu_vis_vram_limit;
 int amdgpu_gart_size = -1; /* auto */
 int amdgpu_gtt_size = -1; /* auto */
@@ -243,6 +244,15 @@ struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
 	.period = 0x0, /* default to 0x0 (timeout disable) */
 };
 
+/**
+ * DOC: ignore_min_pcap (int)
+ * Ignore the minimum power cap.
+ * Useful on graphics cards where the minimum power cap is very high.
+ * The default is 0 (Do not ignore).
+ */
+MODULE_PARM_DESC(ignore_min_pcap, "Ignore the minimum power cap");
+module_param_named(ignore_min_pcap, amdgpu_ignore_min_pcap, int, 0600);
+
 /**
  * DOC: vramlimit (int)
  * Restrict the total amount of VRAM in MiB for testing.  The default is 0 (Use full VRAM).
diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig
index 47b8b49da8a7..943959d1f401 100644
--- a/drivers/gpu/drm/amd/display/Kconfig
+++ b/drivers/gpu/drm/amd/display/Kconfig
@@ -51,4 +51,10 @@ config DRM_AMD_SECURE_DISPLAY
 	  This option enables the calculation of crc of specific region via
 	  debugfs. Cooperate with specific DMCU FW.
 
+config AMD_PRIVATE_COLOR
+	bool "Enable KMS color management by AMD for AMD"
+	default n
+	help
+	  This option extends the KMS color management API with AMD driver-specific properties to enhance the color management support on AMD Steam Deck.
+
 endmenu
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 27e641f17628..7b3635fd2805 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -4129,7 +4129,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev)
 		return r;
 	}
 
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 	if (amdgpu_dm_create_color_properties(adev)) {
 		dc_state_release(state->context);
 		kfree(state);
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
index ebabfe3a512f..4d3ebcaacca1 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c
@@ -97,7 +97,7 @@ static inline struct fixed31_32 amdgpu_dm_fixpt_from_s3132(__u64 x)
 	return val;
 }
 
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 /* Pre-defined Transfer Functions (TF)
  *
  * AMD driver supports pre-defined mathematical functions for transferring
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
index e23a0a276e33..dd83cf50a89b 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
@@ -338,7 +338,7 @@ static int amdgpu_dm_crtc_late_register(struct drm_crtc *crtc)
 }
 #endif
 
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 /**
  * dm_crtc_additional_color_mgmt - enable additional color properties
  * @crtc: DRM CRTC
@@ -420,7 +420,7 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = {
 #if defined(CONFIG_DEBUG_FS)
 	.late_register = amdgpu_dm_crtc_late_register,
 #endif
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 	.atomic_set_property = amdgpu_dm_atomic_crtc_set_property,
 	.atomic_get_property = amdgpu_dm_atomic_crtc_get_property,
 #endif
@@ -599,7 +599,7 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm,
 
 	drm_mode_crtc_set_gamma_size(&acrtc->base, MAX_COLOR_LEGACY_LUT_ENTRIES);
 
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 	dm_crtc_additional_color_mgmt(&acrtc->base);
 #endif
 	return 0;
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
index 7d47acdd11d5..16acd2abdc9a 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c
@@ -1476,7 +1476,7 @@ static void amdgpu_dm_plane_drm_plane_destroy_state(struct drm_plane *plane,
 	drm_atomic_helper_plane_destroy_state(plane, state);
 }
 
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 static void
 dm_atomic_plane_attach_color_mgmt_properties(struct amdgpu_display_manager *dm,
 					     struct drm_plane *plane)
@@ -1667,7 +1667,7 @@ static const struct drm_plane_funcs dm_plane_funcs = {
 	.atomic_duplicate_state = amdgpu_dm_plane_drm_plane_duplicate_state,
 	.atomic_destroy_state = amdgpu_dm_plane_drm_plane_destroy_state,
 	.format_mod_supported = amdgpu_dm_plane_format_mod_supported,
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 	.atomic_set_property = dm_atomic_plane_set_property,
 	.atomic_get_property = dm_atomic_plane_get_property,
 #endif
@@ -1750,7 +1750,7 @@ int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm,
 
 	drm_plane_helper_add(plane, &dm_plane_helper_funcs);
 
-#ifdef AMD_PRIVATE_COLOR
+#ifdef CONFIG_AMD_PRIVATE_COLOR
 	dm_atomic_plane_attach_color_mgmt_properties(dm, plane);
 #endif
 	/* Create (reset) the plane state */
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index c11952a4389b..52f54a228b39 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -3155,6 +3155,9 @@ static ssize_t amdgpu_hwmon_show_power_cap_min(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
 {
+	if (amdgpu_ignore_min_pcap)
+		return sysfs_emit(buf, "%i\n", 0);
+
 	return amdgpu_hwmon_show_power_cap_generic(dev, attr, buf, PP_PWR_LIMIT_MIN);
 }
 
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 45d053497f49..996b4d13805a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -2752,7 +2752,10 @@ int smu_get_power_limit(void *handle,
 			*limit = smu->max_power_limit;
 			break;
 		case SMU_PPT_LIMIT_MIN:
-			*limit = smu->min_power_limit;
+			if (amdgpu_ignore_min_pcap)
+				*limit = 0;
+			else
+				*limit = smu->min_power_limit;
 			break;
 		default:
 			return -EINVAL;
@@ -2776,7 +2779,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit)
 		if (smu->ppt_funcs->set_power_limit)
 			return smu->ppt_funcs->set_power_limit(smu, limit_type, limit);
 
-	if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) {
+	if (amdgpu_ignore_min_pcap) {
+		if ((limit > smu->max_power_limit)) {
+			dev_err(smu->adev->dev,
+				"New power limit (%d) is over the max allowed %d\n",
+				limit, smu->max_power_limit);
+			return -EINVAL;
+		}
+	} else if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) {
 		dev_err(smu->adev->dev,
 			"New power limit (%d) is out of range [%d,%d]\n",
 			limit, smu->min_power_limit, smu->max_power_limit);
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index fe6e8a1bb607..1488a904e3bf 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -238,6 +238,15 @@ config I2C_CHT_WC
 	  combined with a FUSB302 Type-C port-controller as such it is advised
 	  to also select CONFIG_TYPEC_FUSB302=m.
 
+config I2C_NCT6775
+	tristate "Nuvoton NCT6775 and compatible SMBus controller"
+	help
+		If you say yes to this option, support will be included for the
+		Nuvoton NCT6775 and compatible SMBus controllers.
+
+		This driver can also be built as a module.  If so, the module
+		will be called i2c-nct6775.
+
 config I2C_NFORCE2
 	tristate "Nvidia nForce2, nForce3 and nForce4"
 	depends on PCI && HAS_IOPORT
diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
index 78d0561339e5..9ea3a294f9f0 100644
--- a/drivers/i2c/busses/Makefile
+++ b/drivers/i2c/busses/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC)	+= i2c-cht-wc.o
 obj-$(CONFIG_I2C_I801)		+= i2c-i801.o
 obj-$(CONFIG_I2C_ISCH)		+= i2c-isch.o
 obj-$(CONFIG_I2C_ISMT)		+= i2c-ismt.o
+obj-$(CONFIG_I2C_NCT6775)   += i2c-nct6775.o
 obj-$(CONFIG_I2C_NFORCE2)	+= i2c-nforce2.o
 obj-$(CONFIG_I2C_NFORCE2_S4985)	+= i2c-nforce2-s4985.o
 obj-$(CONFIG_I2C_NVIDIA_GPU)	+= i2c-nvidia-gpu.o
diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c
new file mode 100644
index 000000000000..fdbd9a1c8d7a
--- /dev/null
+++ b/drivers/i2c/busses/i2c-nct6775.c
@@ -0,0 +1,648 @@
+/*
+ * i2c-nct6775 - Driver for the SMBus master functionality of
+ *	       Nuvoton NCT677x Super-I/O chips
+ *
+ * Copyright (C) 2019  Adam Honse <calcprogrammer1@gmail.com>
+ *
+ * Derived from nct6775 hwmon driver
+ * Copyright (C) 2012  Guenter Roeck <linux@roeck-us.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/jiffies.h>
+#include <linux/platform_device.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/hwmon-vid.h>
+#include <linux/err.h>
+#include <linux/mutex.h>
+#include <linux/delay.h>
+#include <linux/ioport.h>
+#include <linux/i2c.h>
+#include <linux/acpi.h>
+#include <linux/bitops.h>
+#include <linux/dmi.h>
+#include <linux/io.h>
+#include <linux/nospec.h>
+
+#define DRVNAME "i2c-nct6775"
+
+/* Nuvoton SMBus address offsets */
+#define SMBHSTDAT       (0 + nuvoton_nct6793d_smba)
+#define SMBBLKSZ        (1 + nuvoton_nct6793d_smba)
+#define SMBHSTCMD       (2 + nuvoton_nct6793d_smba)
+#define SMBHSTIDX       (3 + nuvoton_nct6793d_smba)  //Index field is the Command field on other controllers
+#define SMBHSTCTL       (4 + nuvoton_nct6793d_smba)
+#define SMBHSTADD       (5 + nuvoton_nct6793d_smba)
+#define SMBHSTERR       (9 + nuvoton_nct6793d_smba)
+#define SMBHSTSTS       (0xE + nuvoton_nct6793d_smba)
+
+/* Command register */
+#define NCT6793D_READ_BYTE      0
+#define NCT6793D_READ_WORD      1
+#define NCT6793D_READ_BLOCK     2
+#define NCT6793D_BLOCK_WRITE_READ_PROC_CALL 3
+#define NCT6793D_PROC_CALL      4
+#define NCT6793D_WRITE_BYTE     8
+#define NCT6793D_WRITE_WORD     9
+#define NCT6793D_WRITE_BLOCK    10
+
+/* Control register */
+#define NCT6793D_MANUAL_START   128
+#define NCT6793D_SOFT_RESET     64
+
+/* Error register */
+#define NCT6793D_NO_ACK         32
+
+/* Status register */
+#define NCT6793D_FIFO_EMPTY     1
+#define NCT6793D_FIFO_FULL      2
+#define NCT6793D_MANUAL_ACTIVE  4
+
+#define NCT6775_LD_SMBUS		0x0B
+
+/* Other settings */
+#define MAX_RETRIES		400
+
+enum kinds { nct6106, nct6775, nct6776, nct6779, nct6791, nct6792, nct6793,
+	     nct6795, nct6796, nct6798 };
+
+struct nct6775_sio_data {
+	int sioreg;
+	enum kinds kind;
+};
+
+/* used to set data->name = nct6775_device_names[data->sio_kind] */
+static const char * const nct6775_device_names[] = {
+	"nct6106",
+	"nct6775",
+	"nct6776",
+	"nct6779",
+	"nct6791",
+	"nct6792",
+	"nct6793",
+	"nct6795",
+	"nct6796",
+	"nct6798",
+};
+
+static const char * const nct6775_sio_names[] __initconst = {
+	"NCT6106D",
+	"NCT6775F",
+	"NCT6776D/F",
+	"NCT6779D",
+	"NCT6791D",
+	"NCT6792D",
+	"NCT6793D",
+	"NCT6795D",
+	"NCT6796D",
+	"NCT6798D",
+};
+
+#define SIO_REG_LDSEL		0x07	/* Logical device select */
+#define SIO_REG_DEVID		0x20	/* Device ID (2 bytes) */
+#define SIO_REG_SMBA		0x62	/* SMBus base address register */
+
+#define SIO_NCT6106_ID		0xc450
+#define SIO_NCT6775_ID		0xb470
+#define SIO_NCT6776_ID		0xc330
+#define SIO_NCT6779_ID		0xc560
+#define SIO_NCT6791_ID		0xc800
+#define SIO_NCT6792_ID		0xc910
+#define SIO_NCT6793_ID		0xd120
+#define SIO_NCT6795_ID		0xd350
+#define SIO_NCT6796_ID		0xd420
+#define SIO_NCT6798_ID		0xd428
+#define SIO_ID_MASK			0xFFF0
+
+static inline void
+superio_outb(int ioreg, int reg, int val)
+{
+	outb(reg, ioreg);
+	outb(val, ioreg + 1);
+}
+
+static inline int
+superio_inb(int ioreg, int reg)
+{
+	outb(reg, ioreg);
+	return inb(ioreg + 1);
+}
+
+static inline void
+superio_select(int ioreg, int ld)
+{
+	outb(SIO_REG_LDSEL, ioreg);
+	outb(ld, ioreg + 1);
+}
+
+static inline int
+superio_enter(int ioreg)
+{
+	/*
+	 * Try to reserve <ioreg> and <ioreg + 1> for exclusive access.
+	 */
+	if (!request_muxed_region(ioreg, 2, DRVNAME))
+		return -EBUSY;
+
+	outb(0x87, ioreg);
+	outb(0x87, ioreg);
+
+	return 0;
+}
+
+static inline void
+superio_exit(int ioreg)
+{
+	outb(0xaa, ioreg);
+	outb(0x02, ioreg);
+	outb(0x02, ioreg + 1);
+	release_region(ioreg, 2);
+}
+
+/*
+ * ISA constants
+ */
+
+#define IOREGION_ALIGNMENT	(~7)
+#define IOREGION_LENGTH		2
+#define ADDR_REG_OFFSET		0
+#define DATA_REG_OFFSET		1
+
+#define NCT6775_REG_BANK	0x4E
+#define NCT6775_REG_CONFIG	0x40
+
+static struct i2c_adapter *nct6775_adapter;
+
+struct i2c_nct6775_adapdata {
+	unsigned short smba;
+};
+
+/* Return negative errno on error. */
+static s32 nct6775_access(struct i2c_adapter * adap, u16 addr,
+		 unsigned short flags, char read_write,
+		 u8 command, int size, union i2c_smbus_data * data)
+{
+	struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap);
+	unsigned short nuvoton_nct6793d_smba = adapdata->smba;
+	int i, len, cnt;
+	union i2c_smbus_data tmp_data;
+	int timeout = 0;
+
+	tmp_data.word = 0;
+	cnt = 0;
+	len = 0;
+
+	outb_p(NCT6793D_SOFT_RESET, SMBHSTCTL);
+
+	switch (size) {
+		case I2C_SMBUS_QUICK:
+			outb_p((addr << 1) | read_write,
+			       SMBHSTADD);
+			break;
+		case I2C_SMBUS_BYTE_DATA:
+			tmp_data.byte = data->byte;
+			fallthrough;
+		case I2C_SMBUS_BYTE:
+			outb_p((addr << 1) | read_write,
+			       SMBHSTADD);
+			outb_p(command, SMBHSTIDX);
+			if (read_write == I2C_SMBUS_WRITE) {
+				outb_p(tmp_data.byte, SMBHSTDAT);
+				outb_p(NCT6793D_WRITE_BYTE, SMBHSTCMD);
+			}
+			else {
+				outb_p(NCT6793D_READ_BYTE, SMBHSTCMD);
+			}
+			break;
+		case I2C_SMBUS_WORD_DATA:
+			outb_p((addr << 1) | read_write,
+			       SMBHSTADD);
+			outb_p(command, SMBHSTIDX);
+			if (read_write == I2C_SMBUS_WRITE) {
+				outb_p(data->word & 0xff, SMBHSTDAT);
+				outb_p((data->word & 0xff00) >> 8, SMBHSTDAT);
+				outb_p(NCT6793D_WRITE_WORD, SMBHSTCMD);
+			}
+			else {
+				outb_p(NCT6793D_READ_WORD, SMBHSTCMD);
+			}
+			break;
+		case I2C_SMBUS_BLOCK_DATA:
+			outb_p((addr << 1) | read_write,
+			       SMBHSTADD);
+			outb_p(command, SMBHSTIDX);
+			if (read_write == I2C_SMBUS_WRITE) {
+				len = data->block[0];
+				if (len == 0 || len > I2C_SMBUS_BLOCK_MAX)
+					return -EINVAL;
+				outb_p(len, SMBBLKSZ);
+
+				cnt = 1;
+				if (len >= 4) {
+					for (i = cnt; i <= 4; i++) {
+						outb_p(data->block[i], SMBHSTDAT);
+					}
+
+					len -= 4;
+					cnt += 4;
+				}
+				else {
+					for (i = cnt; i <= len; i++ ) {
+						outb_p(data->block[i], SMBHSTDAT);
+					}
+
+					len = 0;
+				}
+
+				outb_p(NCT6793D_WRITE_BLOCK, SMBHSTCMD);
+			}
+			else {
+				return -ENOTSUPP;
+			}
+			break;
+		default:
+			dev_warn(&adap->dev, "Unsupported transaction %d\n", size);
+			return -EOPNOTSUPP;
+	}
+
+	outb_p(NCT6793D_MANUAL_START, SMBHSTCTL);
+
+	while ((size == I2C_SMBUS_BLOCK_DATA) && (len > 0)) {
+		if (read_write == I2C_SMBUS_WRITE) {
+			timeout = 0;
+			while ((inb_p(SMBHSTSTS) & NCT6793D_FIFO_EMPTY) == 0)
+			{
+				if(timeout > MAX_RETRIES)
+				{
+					return -ETIMEDOUT;
+				}
+				usleep_range(250, 500);
+				timeout++;
+			}
+
+			//Load more bytes into FIFO
+			if (len >= 4) {
+				for (i = cnt; i <= (cnt + 4); i++) {
+					outb_p(data->block[i], SMBHSTDAT);
+				}
+
+				len -= 4;
+				cnt += 4;
+			}
+			else {
+				for (i = cnt; i <= (cnt + len); i++) {
+					outb_p(data->block[i], SMBHSTDAT);
+				}
+
+				len = 0;
+			}
+		}
+		else {
+			return -ENOTSUPP;
+		}
+		
+	}
+
+	//wait for manual mode to complete
+	timeout = 0;
+	while ((inb_p(SMBHSTSTS) & NCT6793D_MANUAL_ACTIVE) != 0)
+	{
+		if(timeout > MAX_RETRIES)
+		{
+			return -ETIMEDOUT;
+		}
+		usleep_range(250, 500);
+		timeout++;
+	}
+
+	if ((inb_p(SMBHSTERR) & NCT6793D_NO_ACK) != 0) {    	
+		return -ENXIO;
+	}
+	else if ((read_write == I2C_SMBUS_WRITE) || (size == I2C_SMBUS_QUICK)) {
+		return 0;
+	}
+
+	switch (size) {
+		case I2C_SMBUS_QUICK:
+		case I2C_SMBUS_BYTE_DATA:
+			data->byte = inb_p(SMBHSTDAT);
+			break;
+		case I2C_SMBUS_WORD_DATA:
+			data->word = inb_p(SMBHSTDAT) + (inb_p(SMBHSTDAT) << 8);
+			break;
+	}
+	return 0;
+}
+
+static u32 nct6775_func(struct i2c_adapter *adapter)
+{
+	return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE |
+	    I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA |
+	    I2C_FUNC_SMBUS_BLOCK_DATA;
+}
+
+static const struct i2c_algorithm smbus_algorithm = {
+	.smbus_xfer	= nct6775_access,
+	.functionality	= nct6775_func,
+};
+
+static int nct6775_add_adapter(unsigned short smba, const char *name, struct i2c_adapter **padap)
+{
+	struct i2c_adapter *adap;
+	struct i2c_nct6775_adapdata *adapdata;
+	int retval;
+
+	adap = kzalloc(sizeof(*adap), GFP_KERNEL);
+	if (adap == NULL) {
+		return -ENOMEM;
+	}
+
+	adap->owner = THIS_MODULE;
+	adap->class = I2C_CLASS_HWMON;
+	adap->algo = &smbus_algorithm;
+
+	adapdata = kzalloc(sizeof(*adapdata), GFP_KERNEL);
+	if (adapdata == NULL) {
+		kfree(adap);
+		return -ENOMEM;
+	}
+
+	adapdata->smba = smba;
+
+	snprintf(adap->name, sizeof(adap->name),
+		"SMBus NCT67xx adapter%s at %04x", name, smba);
+
+	i2c_set_adapdata(adap, adapdata);
+
+	retval = i2c_add_adapter(adap);
+	if (retval) {
+		kfree(adapdata);
+		kfree(adap);
+		return retval;
+	}
+
+	*padap = adap;
+	return 0;
+}
+
+static void nct6775_remove_adapter(struct i2c_adapter *adap)
+{
+	struct i2c_nct6775_adapdata *adapdata = i2c_get_adapdata(adap);
+
+	if (adapdata->smba) {
+		i2c_del_adapter(adap);
+		kfree(adapdata);
+		kfree(adap);
+	}
+}
+
+//static SIMPLE_DEV_PM_OPS(nct6775_dev_pm_ops, nct6775_suspend, nct6775_resume);
+
+/*
+ * when Super-I/O functions move to a separate file, the Super-I/O
+ * bus will manage the lifetime of the device and this module will only keep
+ * track of the nct6775 driver. But since we use platform_device_alloc(), we
+ * must keep track of the device
+ */
+static struct platform_device *pdev[2];
+
+static int nct6775_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct nct6775_sio_data *sio_data = dev_get_platdata(dev);
+	struct resource *res;
+
+	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (!devm_request_region(&pdev->dev, res->start, IOREGION_LENGTH,
+				 DRVNAME))
+		return -EBUSY;
+
+	switch (sio_data->kind) {
+	case nct6791:
+	case nct6792:
+	case nct6793:
+	case nct6795:
+	case nct6796:
+	case nct6798:
+		nct6775_add_adapter(res->start, "", &nct6775_adapter);
+		break;
+	default:
+		return -ENODEV;
+	}
+
+	return 0;
+}
+/*
+static void nct6791_enable_io_mapping(int sioaddr)
+{
+	int val;
+
+	val = superio_inb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE);
+	if (val & 0x10) {
+		pr_info("Enabling hardware monitor logical device mappings.\n");
+		superio_outb(sioaddr, NCT6791_REG_HM_IO_SPACE_LOCK_ENABLE,
+			     val & ~0x10);
+	}
+}*/
+
+static struct platform_driver i2c_nct6775_driver = {
+	.driver = {
+		.name	= DRVNAME,
+//		.pm	= &nct6775_dev_pm_ops,
+	},
+	.probe		= nct6775_probe,
+};
+
+static void __exit i2c_nct6775_exit(void)
+{
+	int i;
+
+	if(nct6775_adapter)
+		nct6775_remove_adapter(nct6775_adapter);
+
+	for (i = 0; i < ARRAY_SIZE(pdev); i++) {
+		if (pdev[i])
+			platform_device_unregister(pdev[i]);
+	}
+	platform_driver_unregister(&i2c_nct6775_driver);
+}
+
+/* nct6775_find() looks for a '627 in the Super-I/O config space */
+static int __init nct6775_find(int sioaddr, struct nct6775_sio_data *sio_data)
+{
+	u16 val;
+	int err;
+	int addr;
+
+	err = superio_enter(sioaddr);
+	if (err)
+		return err;
+
+	val = (superio_inb(sioaddr, SIO_REG_DEVID) << 8) |
+		superio_inb(sioaddr, SIO_REG_DEVID + 1);
+
+	switch (val & SIO_ID_MASK) {
+	case SIO_NCT6106_ID:
+		sio_data->kind = nct6106;
+		break;
+	case SIO_NCT6775_ID:
+		sio_data->kind = nct6775;
+		break;
+	case SIO_NCT6776_ID:
+		sio_data->kind = nct6776;
+		break;
+	case SIO_NCT6779_ID:
+		sio_data->kind = nct6779;
+		break;
+	case SIO_NCT6791_ID:
+		sio_data->kind = nct6791;
+		break;
+	case SIO_NCT6792_ID:
+		sio_data->kind = nct6792;
+		break;
+	case SIO_NCT6793_ID:
+		sio_data->kind = nct6793;
+		break;
+	case SIO_NCT6795_ID:
+		sio_data->kind = nct6795;
+		break;
+	case SIO_NCT6796_ID:
+		sio_data->kind = nct6796;
+		break;
+	case SIO_NCT6798_ID:
+		sio_data->kind = nct6798;
+		break;
+	default:
+		if (val != 0xffff)
+			pr_debug("unsupported chip ID: 0x%04x\n", val);
+		superio_exit(sioaddr);
+		return -ENODEV;
+	}
+
+	/* We have a known chip, find the SMBus I/O address */
+	superio_select(sioaddr, NCT6775_LD_SMBUS);
+	val = (superio_inb(sioaddr, SIO_REG_SMBA) << 8)
+	    | superio_inb(sioaddr, SIO_REG_SMBA + 1);
+	addr = val & IOREGION_ALIGNMENT;
+	if (addr == 0) {
+		pr_err("Refusing to enable a Super-I/O device with a base I/O port 0\n");
+		superio_exit(sioaddr);
+		return -ENODEV;
+	}
+
+	//if (sio_data->kind == nct6791 || sio_data->kind == nct6792 ||
+	//    sio_data->kind == nct6793 || sio_data->kind == nct6795 ||
+	//    sio_data->kind == nct6796)
+	//	nct6791_enable_io_mapping(sioaddr);
+
+	superio_exit(sioaddr);
+	pr_info("Found %s or compatible chip at %#x:%#x\n",
+		nct6775_sio_names[sio_data->kind], sioaddr, addr);
+	sio_data->sioreg = sioaddr;
+
+	return addr;
+}
+
+static int __init i2c_nct6775_init(void)
+{
+	int i, err;
+	bool found = false;
+	int address;
+	struct resource res;
+	struct nct6775_sio_data sio_data;
+	int sioaddr[2] = { 0x2e, 0x4e };
+
+	err = platform_driver_register(&i2c_nct6775_driver);
+	if (err)
+		return err;
+
+	/*
+	 * initialize sio_data->kind and sio_data->sioreg.
+	 *
+	 * when Super-I/O functions move to a separate file, the Super-I/O
+	 * driver will probe 0x2e and 0x4e and auto-detect the presence of a
+	 * nct6775 hardware monitor, and call probe()
+	 */
+	for (i = 0; i < ARRAY_SIZE(pdev); i++) {
+		address = nct6775_find(sioaddr[i], &sio_data);
+		if (address <= 0)
+			continue;
+
+		found = true;
+
+		pdev[i] = platform_device_alloc(DRVNAME, address);
+		if (!pdev[i]) {
+			err = -ENOMEM;
+			goto exit_device_unregister;
+		}
+
+		err = platform_device_add_data(pdev[i], &sio_data,
+					       sizeof(struct nct6775_sio_data));
+		if (err)
+			goto exit_device_put;
+
+		memset(&res, 0, sizeof(res));
+		res.name = DRVNAME;
+		res.start = address;
+		res.end = address + IOREGION_LENGTH - 1;
+		res.flags = IORESOURCE_IO;
+
+		err = acpi_check_resource_conflict(&res);
+		if (err) {
+			platform_device_put(pdev[i]);
+			pdev[i] = NULL;
+			continue;
+		}
+
+		err = platform_device_add_resources(pdev[i], &res, 1);
+		if (err)
+			goto exit_device_put;
+
+		/* platform_device_add calls probe() */
+		err = platform_device_add(pdev[i]);
+		if (err)
+			goto exit_device_put;
+	}
+	if (!found) {
+		err = -ENODEV;
+		goto exit_unregister;
+	}
+
+	return 0;
+
+exit_device_put:
+	platform_device_put(pdev[i]);
+exit_device_unregister:
+	while (--i >= 0) {
+		if (pdev[i])
+			platform_device_unregister(pdev[i]);
+	}
+exit_unregister:
+	platform_driver_unregister(&i2c_nct6775_driver);
+	return err;
+}
+
+MODULE_AUTHOR("Adam Honse <calcprogrammer1@gmail.com>");
+MODULE_DESCRIPTION("SMBus driver for NCT6775F and compatible chips");
+MODULE_LICENSE("GPL");
+
+module_init(i2c_nct6775_init);
+module_exit(i2c_nct6775_exit);
diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c
index 6a0392172b2f..e7dd007bf6b1 100644
--- a/drivers/i2c/busses/i2c-piix4.c
+++ b/drivers/i2c/busses/i2c-piix4.c
@@ -568,11 +568,11 @@ static int piix4_transaction(struct i2c_adapter *piix4_adapter)
 	if (srvrworks_csb5_delay) /* Extra delay for SERVERWORKS_CSB5 */
 		usleep_range(2000, 2100);
 	else
-		usleep_range(250, 500);
+		usleep_range(25, 50);
 
 	while ((++timeout < MAX_TIMEOUT) &&
 	       ((temp = inb_p(SMBHSTSTS)) & 0x01))
-		usleep_range(250, 500);
+		usleep_range(25, 50);
 
 	/* If the SMBus is still busy, we give up */
 	if (timeout == MAX_TIMEOUT) {
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index 51e0c4954600..35c3ad741870 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -46,6 +46,7 @@ struct evdev_client {
 	struct fasync_struct *fasync;
 	struct evdev *evdev;
 	struct list_head node;
+	struct rcu_head rcu;
 	enum input_clock_type clk_type;
 	bool revoked;
 	unsigned long *evmasks[EV_CNT];
@@ -377,13 +378,22 @@ static void evdev_attach_client(struct evdev *evdev,
 	spin_unlock(&evdev->client_lock);
 }
 
+static void evdev_reclaim_client(struct rcu_head *rp)
+{
+	struct evdev_client *client = container_of(rp, struct evdev_client, rcu);
+	unsigned int i;
+	for (i = 0; i < EV_CNT; ++i)
+		bitmap_free(client->evmasks[i]);
+	kvfree(client);
+}
+
 static void evdev_detach_client(struct evdev *evdev,
 				struct evdev_client *client)
 {
 	spin_lock(&evdev->client_lock);
 	list_del_rcu(&client->node);
 	spin_unlock(&evdev->client_lock);
-	synchronize_rcu();
+	call_rcu(&client->rcu, evdev_reclaim_client);
 }
 
 static int evdev_open_device(struct evdev *evdev)
@@ -436,7 +446,6 @@ static int evdev_release(struct inode *inode, struct file *file)
 {
 	struct evdev_client *client = file->private_data;
 	struct evdev *evdev = client->evdev;
-	unsigned int i;
 
 	mutex_lock(&evdev->mutex);
 
@@ -448,11 +457,6 @@ static int evdev_release(struct inode *inode, struct file *file)
 
 	evdev_detach_client(evdev, client);
 
-	for (i = 0; i < EV_CNT; ++i)
-		bitmap_free(client->evmasks[i]);
-
-	kvfree(client);
-
 	evdev_close_device(evdev);
 
 	return 0;
@@ -495,7 +499,6 @@ static int evdev_open(struct inode *inode, struct file *file)
 
  err_free_client:
 	evdev_detach_client(evdev, client);
-	kvfree(client);
 	return error;
 }
 
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 1b7a97cc3779..37e9e43908ab 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3284,6 +3284,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 			goto bad;
 	}
 
+#ifdef CONFIG_CACHY
+	set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags);
+	set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
+#endif
+
 	ret = crypt_ctr_cipher(ti, argv[0], argv[1]);
 	if (ret < 0)
 		goto bad;
diff --git a/drivers/media/v4l2-core/Kconfig b/drivers/media/v4l2-core/Kconfig
index 331b8e535e5b..80dabeebf580 100644
--- a/drivers/media/v4l2-core/Kconfig
+++ b/drivers/media/v4l2-core/Kconfig
@@ -40,6 +40,11 @@ config VIDEO_TUNER
 config V4L2_JPEG_HELPER
 	tristate
 
+config V4L2_LOOPBACK
+	tristate "V4L2 loopback device"
+	help
+	  V4L2 loopback device
+
 # Used by drivers that need v4l2-h264.ko
 config V4L2_H264
 	tristate
diff --git a/drivers/media/v4l2-core/Makefile b/drivers/media/v4l2-core/Makefile
index 2177b9d63a8f..c179507cedc4 100644
--- a/drivers/media/v4l2-core/Makefile
+++ b/drivers/media/v4l2-core/Makefile
@@ -33,5 +33,7 @@ obj-$(CONFIG_V4L2_JPEG_HELPER) += v4l2-jpeg.o
 obj-$(CONFIG_V4L2_MEM2MEM_DEV) += v4l2-mem2mem.o
 obj-$(CONFIG_V4L2_VP9) += v4l2-vp9.o
 
+obj-$(CONFIG_V4L2_LOOPBACK) += v4l2loopback.o
+
 obj-$(CONFIG_VIDEO_TUNER) += tuner.o
 obj-$(CONFIG_VIDEO_DEV) += v4l2-dv-timings.o videodev.o
diff --git a/drivers/media/v4l2-core/v4l2loopback.c b/drivers/media/v4l2-core/v4l2loopback.c
new file mode 100644
index 000000000000..25cb1beb26e5
--- /dev/null
+++ b/drivers/media/v4l2-core/v4l2loopback.c
@@ -0,0 +1,3184 @@
+/* -*- c-file-style: "linux" -*- */
+/*
+ * v4l2loopback.c  --  video4linux2 loopback driver
+ *
+ * Copyright (C) 2005-2009 Vasily Levin (vasaka@gmail.com)
+ * Copyright (C) 2010-2023 IOhannes m zmoelnig (zmoelnig@iem.at)
+ * Copyright (C) 2011 Stefan Diewald (stefan.diewald@mytum.de)
+ * Copyright (C) 2012 Anton Novikov (random.plant@gmail.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/time.h>
+#include <linux/module.h>
+#include <linux/videodev2.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/capability.h>
+#include <linux/eventpoll.h>
+#include <media/v4l2-ioctl.h>
+#include <media/v4l2-common.h>
+#include <media/v4l2-device.h>
+#include <media/v4l2-ctrls.h>
+#include <media/v4l2-event.h>
+
+#include <linux/miscdevice.h>
+#include "v4l2loopback.h"
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0)
+#error This module is not supported on kernels before 4.0.0.
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0)
+#define strscpy strlcpy
+#endif
+
+#if defined(timer_setup) && defined(from_timer)
+#define HAVE_TIMER_SETUP
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0)
+#define VFL_TYPE_VIDEO VFL_TYPE_GRABBER
+#endif
+
+#define V4L2LOOPBACK_VERSION_CODE                                              \
+	KERNEL_VERSION(V4L2LOOPBACK_VERSION_MAJOR, V4L2LOOPBACK_VERSION_MINOR, \
+		       V4L2LOOPBACK_VERSION_BUGFIX)
+
+MODULE_DESCRIPTION("V4L2 loopback video device");
+MODULE_AUTHOR("Vasily Levin, "
+	      "IOhannes m zmoelnig <zmoelnig@iem.at>,"
+	      "Stefan Diewald,"
+	      "Anton Novikov"
+	      "et al.");
+#ifdef SNAPSHOT_VERSION
+MODULE_VERSION(__stringify(SNAPSHOT_VERSION));
+#else
+MODULE_VERSION("" __stringify(V4L2LOOPBACK_VERSION_MAJOR) "." __stringify(
+	V4L2LOOPBACK_VERSION_MINOR) "." __stringify(V4L2LOOPBACK_VERSION_BUGFIX));
+#endif
+MODULE_LICENSE("GPL");
+
+/*
+ * helpers
+ */
+#define dprintk(fmt, args...)                                          \
+	do {                                                           \
+		if (debug > 0) {                                       \
+			printk(KERN_INFO "v4l2-loopback[" __stringify( \
+				       __LINE__) "], pid(%d):  " fmt,  \
+			       task_pid_nr(current), ##args);          \
+		}                                                      \
+	} while (0)
+
+#define MARK()                                                             \
+	do {                                                               \
+		if (debug > 1) {                                           \
+			printk(KERN_INFO "%s:%d[%s], pid(%d)\n", __FILE__, \
+			       __LINE__, __func__, task_pid_nr(current));  \
+		}                                                          \
+	} while (0)
+
+#define dprintkrw(fmt, args...)                                        \
+	do {                                                           \
+		if (debug > 2) {                                       \
+			printk(KERN_INFO "v4l2-loopback[" __stringify( \
+				       __LINE__) "], pid(%d): " fmt,   \
+			       task_pid_nr(current), ##args);          \
+		}                                                      \
+	} while (0)
+
+static inline void v4l2l_get_timestamp(struct v4l2_buffer *b)
+{
+	struct timespec64 ts;
+	ktime_get_ts64(&ts);
+
+	b->timestamp.tv_sec = ts.tv_sec;
+	b->timestamp.tv_usec = (ts.tv_nsec / NSEC_PER_USEC);
+	b->flags |= V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
+}
+
+#if BITS_PER_LONG == 32
+#include <asm/div64.h> /* do_div() for 64bit division */
+static inline int v4l2l_mod64(const s64 A, const u32 B)
+{
+	u64 a = (u64)A;
+	u32 b = B;
+
+	if (A > 0)
+		return do_div(a, b);
+	a = -A;
+	return -do_div(a, b);
+}
+#else
+static inline int v4l2l_mod64(const s64 A, const u32 B)
+{
+	return A % B;
+}
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0)
+typedef unsigned __poll_t;
+#endif
+
+/* module constants
+ *  can be overridden during he build process using something like
+ *	make KCPPFLAGS="-DMAX_DEVICES=100"
+ */
+
+/* maximum number of v4l2loopback devices that can be created */
+#ifndef MAX_DEVICES
+#define MAX_DEVICES 8
+#endif
+
+/* whether the default is to announce capabilities exclusively or not */
+#ifndef V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS
+#define V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS 0
+#endif
+
+/* when a producer is considered to have gone stale */
+#ifndef MAX_TIMEOUT
+#define MAX_TIMEOUT (100 * 1000) /* in msecs */
+#endif
+
+/* max buffers that can be mapped, actually they
+ * are all mapped to max_buffers buffers */
+#ifndef MAX_BUFFERS
+#define MAX_BUFFERS 32
+#endif
+
+/* module parameters */
+static int debug = 0;
+module_param(debug, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(debug, "debugging level (higher values == more verbose)");
+
+#define V4L2LOOPBACK_DEFAULT_MAX_BUFFERS 2
+static int max_buffers = V4L2LOOPBACK_DEFAULT_MAX_BUFFERS;
+module_param(max_buffers, int, S_IRUGO);
+MODULE_PARM_DESC(max_buffers,
+		 "how many buffers should be allocated [DEFAULT: " __stringify(
+			 V4L2LOOPBACK_DEFAULT_MAX_BUFFERS) "]");
+
+/* how many times a device can be opened
+ * the per-module default value can be overridden on a per-device basis using
+ * the /sys/devices interface
+ *
+ * note that max_openers should be at least 2 in order to get a working system:
+ *   one opener for the producer and one opener for the consumer
+ *   however, we leave that to the user
+ */
+#define V4L2LOOPBACK_DEFAULT_MAX_OPENERS 10
+static int max_openers = V4L2LOOPBACK_DEFAULT_MAX_OPENERS;
+module_param(max_openers, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(
+	max_openers,
+	"how many users can open the loopback device [DEFAULT: " __stringify(
+		V4L2LOOPBACK_DEFAULT_MAX_OPENERS) "]");
+
+static int devices = -1;
+module_param(devices, int, 0);
+MODULE_PARM_DESC(devices, "how many devices should be created");
+
+static int video_nr[MAX_DEVICES] = { [0 ...(MAX_DEVICES - 1)] = -1 };
+module_param_array(video_nr, int, NULL, 0444);
+MODULE_PARM_DESC(video_nr,
+		 "video device numbers (-1=auto, 0=/dev/video0, etc.)");
+
+static char *card_label[MAX_DEVICES];
+module_param_array(card_label, charp, NULL, 0000);
+MODULE_PARM_DESC(card_label, "card labels for each device");
+
+static bool exclusive_caps[MAX_DEVICES] = {
+	[0 ...(MAX_DEVICES - 1)] = V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS
+};
+module_param_array(exclusive_caps, bool, NULL, 0444);
+/* FIXXME: wording */
+MODULE_PARM_DESC(
+	exclusive_caps,
+	"whether to announce OUTPUT/CAPTURE capabilities exclusively or not  [DEFAULT: " __stringify(
+		V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS) "]");
+
+/* format specifications */
+#define V4L2LOOPBACK_SIZE_MIN_WIDTH 2
+#define V4L2LOOPBACK_SIZE_MIN_HEIGHT 1
+#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH 8192
+#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT 8192
+
+#define V4L2LOOPBACK_SIZE_DEFAULT_WIDTH 640
+#define V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT 480
+
+static int max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH;
+module_param(max_width, int, S_IRUGO);
+MODULE_PARM_DESC(max_width,
+		 "maximum allowed frame width [DEFAULT: " __stringify(
+			 V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH) "]");
+static int max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT;
+module_param(max_height, int, S_IRUGO);
+MODULE_PARM_DESC(max_height,
+		 "maximum allowed frame height [DEFAULT: " __stringify(
+			 V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT) "]");
+
+static DEFINE_IDR(v4l2loopback_index_idr);
+static DEFINE_MUTEX(v4l2loopback_ctl_mutex);
+
+/* frame intervals */
+#define V4L2LOOPBACK_FPS_MIN 0
+#define V4L2LOOPBACK_FPS_MAX 1000
+
+/* control IDs */
+#define V4L2LOOPBACK_CID_BASE (V4L2_CID_USER_BASE | 0xf000)
+#define CID_KEEP_FORMAT (V4L2LOOPBACK_CID_BASE + 0)
+#define CID_SUSTAIN_FRAMERATE (V4L2LOOPBACK_CID_BASE + 1)
+#define CID_TIMEOUT (V4L2LOOPBACK_CID_BASE + 2)
+#define CID_TIMEOUT_IMAGE_IO (V4L2LOOPBACK_CID_BASE + 3)
+
+static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl);
+static const struct v4l2_ctrl_ops v4l2loopback_ctrl_ops = {
+	.s_ctrl = v4l2loopback_s_ctrl,
+};
+static const struct v4l2_ctrl_config v4l2loopback_ctrl_keepformat = {
+	// clang-format off
+	.ops	= &v4l2loopback_ctrl_ops,
+	.id	= CID_KEEP_FORMAT,
+	.name	= "keep_format",
+	.type	= V4L2_CTRL_TYPE_BOOLEAN,
+	.min	= 0,
+	.max	= 1,
+	.step	= 1,
+	.def	= 0,
+	// clang-format on
+};
+static const struct v4l2_ctrl_config v4l2loopback_ctrl_sustainframerate = {
+	// clang-format off
+	.ops	= &v4l2loopback_ctrl_ops,
+	.id	= CID_SUSTAIN_FRAMERATE,
+	.name	= "sustain_framerate",
+	.type	= V4L2_CTRL_TYPE_BOOLEAN,
+	.min	= 0,
+	.max	= 1,
+	.step	= 1,
+	.def	= 0,
+	// clang-format on
+};
+static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeout = {
+	// clang-format off
+	.ops	= &v4l2loopback_ctrl_ops,
+	.id	= CID_TIMEOUT,
+	.name	= "timeout",
+	.type	= V4L2_CTRL_TYPE_INTEGER,
+	.min	= 0,
+	.max	= MAX_TIMEOUT,
+	.step	= 1,
+	.def	= 0,
+	// clang-format on
+};
+static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeoutimageio = {
+	// clang-format off
+	.ops	= &v4l2loopback_ctrl_ops,
+	.id	= CID_TIMEOUT_IMAGE_IO,
+	.name	= "timeout_image_io",
+	.type	= V4L2_CTRL_TYPE_BUTTON,
+	.min	= 0,
+	.max	= 1,
+	.step	= 1,
+	.def	= 0,
+	// clang-format on
+};
+
+/* module structures */
+struct v4l2loopback_private {
+	int device_nr;
+};
+
+/* TODO(vasaka) use typenames which are common to kernel, but first find out if
+ * it is needed */
+/* struct keeping state and settings of loopback device */
+
+struct v4l2l_buffer {
+	struct v4l2_buffer buffer;
+	struct list_head list_head;
+	int use_count;
+};
+
+struct v4l2_loopback_device {
+	struct v4l2_device v4l2_dev;
+	struct v4l2_ctrl_handler ctrl_handler;
+	struct video_device *vdev;
+	/* pixel and stream format */
+	struct v4l2_pix_format pix_format;
+	bool pix_format_has_valid_sizeimage;
+	struct v4l2_captureparm capture_param;
+	unsigned long frame_jiffies;
+
+	/* ctrls */
+	int keep_format; /* CID_KEEP_FORMAT; stay ready_for_capture even when all
+			    openers close() the device */
+	int sustain_framerate; /* CID_SUSTAIN_FRAMERATE; duplicate frames to maintain
+				  (close to) nominal framerate */
+
+	/* buffers stuff */
+	u8 *image; /* pointer to actual buffers data */
+	unsigned long int imagesize; /* size of buffers data */
+	int buffers_number; /* should not be big, 4 is a good choice */
+	struct v4l2l_buffer buffers[MAX_BUFFERS]; /* inner driver buffers */
+	int used_buffers; /* number of the actually used buffers */
+	int max_openers; /* how many times can this device be opened */
+
+	s64 write_position; /* number of last written frame + 1 */
+	struct list_head outbufs_list; /* buffers in output DQBUF order */
+	int bufpos2index
+		[MAX_BUFFERS]; /* mapping of (read/write_position % used_buffers)
+                        * to inner buffer index */
+	long buffer_size;
+
+	/* sustain_framerate stuff */
+	struct timer_list sustain_timer;
+	unsigned int reread_count;
+
+	/* timeout stuff */
+	unsigned long timeout_jiffies; /* CID_TIMEOUT; 0 means disabled */
+	int timeout_image_io; /* CID_TIMEOUT_IMAGE_IO; next opener will
+			       * read/write to timeout_image */
+	u8 *timeout_image; /* copy of it will be captured when timeout passes */
+	struct v4l2l_buffer timeout_image_buffer;
+	struct timer_list timeout_timer;
+	int timeout_happened;
+
+	/* sync stuff */
+	atomic_t open_count;
+
+	int ready_for_capture; /* set to the number of writers that opened the
+                                * device and negotiated format. */
+	int ready_for_output; /* set to true when no writer is currently attached
+			       * this differs slightly from !ready_for_capture,
+			       * e.g. when using fallback images */
+	int active_readers; /* increase if any reader starts streaming */
+	int announce_all_caps; /* set to false, if device caps (OUTPUT/CAPTURE)
+                                * should only be announced if the resp. "ready"
+                                * flag is set; default=TRUE */
+
+	int min_width, max_width;
+	int min_height, max_height;
+
+	char card_label[32];
+
+	wait_queue_head_t read_event;
+	spinlock_t lock, list_lock;
+};
+
+/* types of opener shows what opener wants to do with loopback */
+enum opener_type {
+	// clang-format off
+	UNNEGOTIATED	= 0,
+	READER		= 1,
+	WRITER		= 2,
+	// clang-format on
+};
+
+/* struct keeping state and type of opener */
+struct v4l2_loopback_opener {
+	enum opener_type type;
+	s64 read_position; /* number of last processed frame + 1 or
+			    * write_position - 1 if reader went out of sync */
+	unsigned int reread_count;
+	struct v4l2_buffer *buffers;
+	int buffers_number; /* should not be big, 4 is a good choice */
+	int timeout_image_io;
+
+	struct v4l2_fh fh;
+};
+
+#define fh_to_opener(ptr) container_of((ptr), struct v4l2_loopback_opener, fh)
+
+/* this is heavily inspired by the bttv driver found in the linux kernel */
+struct v4l2l_format {
+	char *name;
+	int fourcc; /* video4linux 2 */
+	int depth; /* bit/pixel */
+	int flags;
+};
+/* set the v4l2l_format.flags to PLANAR for non-packed formats */
+#define FORMAT_FLAGS_PLANAR 0x01
+#define FORMAT_FLAGS_COMPRESSED 0x02
+
+#include "v4l2loopback_formats.h"
+
+#ifndef V4L2_TYPE_IS_CAPTURE
+#define V4L2_TYPE_IS_CAPTURE(type)                \
+	((type) == V4L2_BUF_TYPE_VIDEO_CAPTURE || \
+	 (type) == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE)
+#endif /* V4L2_TYPE_IS_CAPTURE */
+#ifndef V4L2_TYPE_IS_OUTPUT
+#define V4L2_TYPE_IS_OUTPUT(type)                \
+	((type) == V4L2_BUF_TYPE_VIDEO_OUTPUT || \
+	 (type) == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE)
+#endif /* V4L2_TYPE_IS_OUTPUT */
+
+/* whether the format can be changed */
+/* the format is fixated if we
+   - have writers (ready_for_capture>0)
+   - and/or have readers (active_readers>0)
+*/
+#define V4L2LOOPBACK_IS_FIXED_FMT(device)                               \
+	(device->ready_for_capture > 0 || device->active_readers > 0 || \
+	 device->keep_format)
+
+static const unsigned int FORMATS = ARRAY_SIZE(formats);
+
+static char *fourcc2str(unsigned int fourcc, char buf[4])
+{
+	buf[0] = (fourcc >> 0) & 0xFF;
+	buf[1] = (fourcc >> 8) & 0xFF;
+	buf[2] = (fourcc >> 16) & 0xFF;
+	buf[3] = (fourcc >> 24) & 0xFF;
+
+	return buf;
+}
+
+static const struct v4l2l_format *format_by_fourcc(int fourcc)
+{
+	unsigned int i;
+
+	for (i = 0; i < FORMATS; i++) {
+		if (formats[i].fourcc == fourcc)
+			return formats + i;
+	}
+
+	dprintk("unsupported format '%c%c%c%c'\n", (fourcc >> 0) & 0xFF,
+		(fourcc >> 8) & 0xFF, (fourcc >> 16) & 0xFF,
+		(fourcc >> 24) & 0xFF);
+	return NULL;
+}
+
+static void pix_format_set_size(struct v4l2_pix_format *f,
+				const struct v4l2l_format *fmt,
+				unsigned int width, unsigned int height)
+{
+	f->width = width;
+	f->height = height;
+
+	if (fmt->flags & FORMAT_FLAGS_PLANAR) {
+		f->bytesperline = width; /* Y plane */
+		f->sizeimage = (width * height * fmt->depth) >> 3;
+	} else if (fmt->flags & FORMAT_FLAGS_COMPRESSED) {
+		/* doesn't make sense for compressed formats */
+		f->bytesperline = 0;
+		f->sizeimage = (width * height * fmt->depth) >> 3;
+	} else {
+		f->bytesperline = (width * fmt->depth) >> 3;
+		f->sizeimage = height * f->bytesperline;
+	}
+}
+
+static int v4l2l_fill_format(struct v4l2_format *fmt, int capture,
+			     const u32 minwidth, const u32 maxwidth,
+			     const u32 minheight, const u32 maxheight)
+{
+	u32 width = fmt->fmt.pix.width, height = fmt->fmt.pix.height;
+	u32 pixelformat = fmt->fmt.pix.pixelformat;
+	struct v4l2_format fmt0 = *fmt;
+	u32 bytesperline = 0, sizeimage = 0;
+	if (!width)
+		width = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH;
+	if (!height)
+		height = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT;
+	if (width < minwidth)
+		width = minwidth;
+	if (width > maxwidth)
+		width = maxwidth;
+	if (height < minheight)
+		height = minheight;
+	if (height > maxheight)
+		height = maxheight;
+
+	/* sets: width,height,pixelformat,bytesperline,sizeimage */
+	if (!(V4L2_TYPE_IS_MULTIPLANAR(fmt0.type))) {
+		fmt0.fmt.pix.bytesperline = 0;
+		fmt0.fmt.pix.sizeimage = 0;
+	}
+
+	if (0) {
+		;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0)
+	} else if (!v4l2_fill_pixfmt(&fmt0.fmt.pix, pixelformat, width,
+				     height)) {
+		;
+	} else if (!v4l2_fill_pixfmt_mp(&fmt0.fmt.pix_mp, pixelformat, width,
+					height)) {
+		;
+#endif
+	} else {
+		const struct v4l2l_format *format =
+			format_by_fourcc(pixelformat);
+		if (!format)
+			return -EINVAL;
+		pix_format_set_size(&fmt0.fmt.pix, format, width, height);
+		fmt0.fmt.pix.pixelformat = format->fourcc;
+	}
+
+	if (V4L2_TYPE_IS_MULTIPLANAR(fmt0.type)) {
+		*fmt = fmt0;
+
+		if ((fmt->fmt.pix_mp.colorspace == V4L2_COLORSPACE_DEFAULT) ||
+		    (fmt->fmt.pix_mp.colorspace > V4L2_COLORSPACE_DCI_P3))
+			fmt->fmt.pix_mp.colorspace = V4L2_COLORSPACE_SRGB;
+		if (V4L2_FIELD_ANY == fmt->fmt.pix_mp.field)
+			fmt->fmt.pix_mp.field = V4L2_FIELD_NONE;
+		if (capture)
+			fmt->type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
+		else
+			fmt->type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
+	} else {
+		bytesperline = fmt->fmt.pix.bytesperline;
+		sizeimage = fmt->fmt.pix.sizeimage;
+
+		*fmt = fmt0;
+
+		if (!fmt->fmt.pix.bytesperline)
+			fmt->fmt.pix.bytesperline = bytesperline;
+		if (!fmt->fmt.pix.sizeimage)
+			fmt->fmt.pix.sizeimage = sizeimage;
+
+		if ((fmt->fmt.pix.colorspace == V4L2_COLORSPACE_DEFAULT) ||
+		    (fmt->fmt.pix.colorspace > V4L2_COLORSPACE_DCI_P3))
+			fmt->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB;
+		if (V4L2_FIELD_ANY == fmt->fmt.pix.field)
+			fmt->fmt.pix.field = V4L2_FIELD_NONE;
+		if (capture)
+			fmt->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+		else
+			fmt->type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
+	}
+
+	return 0;
+}
+
+/* Checks if v4l2l_fill_format() has set a valid, fixed sizeimage val. */
+static bool v4l2l_pix_format_has_valid_sizeimage(struct v4l2_format *fmt)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0)
+	const struct v4l2_format_info *info;
+
+	info = v4l2_format_info(fmt->fmt.pix.pixelformat);
+	if (info && info->mem_planes == 1)
+		return true;
+#endif
+
+	return false;
+}
+
+static int pix_format_eq(const struct v4l2_pix_format *ref,
+			 const struct v4l2_pix_format *tgt, int strict)
+{
+	/* check if the two formats are equivalent.
+	 * ANY fields are handled gracefully
+	 */
+#define _pix_format_eq0(x)    \
+	if (ref->x != tgt->x) \
+	result = 0
+#define _pix_format_eq1(x, def)                              \
+	do {                                                 \
+		if ((def != tgt->x) && (ref->x != tgt->x)) { \
+			printk(KERN_INFO #x " failed");      \
+			result = 0;                          \
+		}                                            \
+	} while (0)
+	int result = 1;
+	_pix_format_eq0(width);
+	_pix_format_eq0(height);
+	_pix_format_eq0(pixelformat);
+	if (!strict)
+		return result;
+	_pix_format_eq1(field, V4L2_FIELD_ANY);
+	_pix_format_eq0(bytesperline);
+	_pix_format_eq0(sizeimage);
+	_pix_format_eq1(colorspace, V4L2_COLORSPACE_DEFAULT);
+	return result;
+}
+
+static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f);
+static int inner_try_setfmt(struct file *file, struct v4l2_format *fmt)
+{
+	int capture = V4L2_TYPE_IS_CAPTURE(fmt->type);
+	struct v4l2_loopback_device *dev;
+	int needschange = 0;
+	char buf[5];
+	buf[4] = 0;
+
+	dev = v4l2loopback_getdevice(file);
+
+	needschange = !(pix_format_eq(&dev->pix_format, &fmt->fmt.pix, 0));
+	if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) {
+		fmt->fmt.pix = dev->pix_format;
+		if (needschange) {
+			if (dev->active_readers > 0 && capture) {
+				/* cannot call fmt_cap while there are readers */
+				return -EBUSY;
+			}
+			if (dev->ready_for_capture > 0 && !capture) {
+				/* cannot call fmt_out while there are writers */
+				return -EBUSY;
+			}
+		}
+	}
+	if (v4l2l_fill_format(fmt, capture, dev->min_width, dev->max_width,
+			      dev->min_height, dev->max_height) != 0) {
+		return -EINVAL;
+	}
+
+	if (1) {
+		char buf[5];
+		buf[4] = 0;
+		dprintk("capFOURCC=%s\n",
+			fourcc2str(dev->pix_format.pixelformat, buf));
+	}
+	return 0;
+}
+
+static int set_timeperframe(struct v4l2_loopback_device *dev,
+			    struct v4l2_fract *tpf)
+{
+	if ((tpf->denominator < 1) || (tpf->numerator < 1)) {
+		return -EINVAL;
+	}
+	dev->capture_param.timeperframe = *tpf;
+	dev->frame_jiffies = max(1UL, msecs_to_jiffies(1000) * tpf->numerator /
+					      tpf->denominator);
+	return 0;
+}
+
+static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd);
+
+/* device attributes */
+/* available via sysfs: /sys/devices/virtual/video4linux/video* */
+
+static ssize_t attr_show_format(struct device *cd,
+				struct device_attribute *attr, char *buf)
+{
+	/* gets the current format as "FOURCC:WxH@f/s", e.g. "YUYV:320x240@1000/30" */
+	struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd);
+	const struct v4l2_fract *tpf;
+	char buf4cc[5], buf_fps[32];
+
+	if (!dev || !V4L2LOOPBACK_IS_FIXED_FMT(dev))
+		return 0;
+	tpf = &dev->capture_param.timeperframe;
+
+	fourcc2str(dev->pix_format.pixelformat, buf4cc);
+	buf4cc[4] = 0;
+	if (tpf->numerator == 1)
+		snprintf(buf_fps, sizeof(buf_fps), "%d", tpf->denominator);
+	else
+		snprintf(buf_fps, sizeof(buf_fps), "%d/%d", tpf->denominator,
+			 tpf->numerator);
+	return sprintf(buf, "%4s:%dx%d@%s\n", buf4cc, dev->pix_format.width,
+		       dev->pix_format.height, buf_fps);
+}
+
+static ssize_t attr_store_format(struct device *cd,
+				 struct device_attribute *attr, const char *buf,
+				 size_t len)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd);
+	int fps_num = 0, fps_den = 1;
+
+	if (!dev)
+		return -ENODEV;
+
+	/* only fps changing is supported */
+	if (sscanf(buf, "@%d/%d", &fps_num, &fps_den) > 0) {
+		struct v4l2_fract f = { .numerator = fps_den,
+					.denominator = fps_num };
+		int err = 0;
+		if ((err = set_timeperframe(dev, &f)) < 0)
+			return err;
+		return len;
+	}
+	return -EINVAL;
+}
+
+static DEVICE_ATTR(format, S_IRUGO | S_IWUSR, attr_show_format,
+		   attr_store_format);
+
+static ssize_t attr_show_buffers(struct device *cd,
+				 struct device_attribute *attr, char *buf)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd);
+
+	if (!dev)
+		return -ENODEV;
+
+	return sprintf(buf, "%d\n", dev->used_buffers);
+}
+
+static DEVICE_ATTR(buffers, S_IRUGO, attr_show_buffers, NULL);
+
+static ssize_t attr_show_maxopeners(struct device *cd,
+				    struct device_attribute *attr, char *buf)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd);
+
+	if (!dev)
+		return -ENODEV;
+
+	return sprintf(buf, "%d\n", dev->max_openers);
+}
+
+static ssize_t attr_store_maxopeners(struct device *cd,
+				     struct device_attribute *attr,
+				     const char *buf, size_t len)
+{
+	struct v4l2_loopback_device *dev = NULL;
+	unsigned long curr = 0;
+
+	if (kstrtoul(buf, 0, &curr))
+		return -EINVAL;
+
+	dev = v4l2loopback_cd2dev(cd);
+	if (!dev)
+		return -ENODEV;
+
+	if (dev->max_openers == curr)
+		return len;
+
+	if (curr > __INT_MAX__ || dev->open_count.counter > curr) {
+		/* request to limit to less openers as are currently attached to us */
+		return -EINVAL;
+	}
+
+	dev->max_openers = (int)curr;
+
+	return len;
+}
+
+static DEVICE_ATTR(max_openers, S_IRUGO | S_IWUSR, attr_show_maxopeners,
+		   attr_store_maxopeners);
+
+static ssize_t attr_show_state(struct device *cd, struct device_attribute *attr,
+			       char *buf)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd);
+
+	if (!dev)
+		return -ENODEV;
+
+	if (dev->ready_for_capture)
+		return sprintf(buf, "capture\n");
+	if (dev->ready_for_output)
+		return sprintf(buf, "output\n");
+
+	return -EAGAIN;
+}
+
+static DEVICE_ATTR(state, S_IRUGO, attr_show_state, NULL);
+
+static void v4l2loopback_remove_sysfs(struct video_device *vdev)
+{
+#define V4L2_SYSFS_DESTROY(x) device_remove_file(&vdev->dev, &dev_attr_##x)
+
+	if (vdev) {
+		V4L2_SYSFS_DESTROY(format);
+		V4L2_SYSFS_DESTROY(buffers);
+		V4L2_SYSFS_DESTROY(max_openers);
+		V4L2_SYSFS_DESTROY(state);
+		/* ... */
+	}
+}
+
+static void v4l2loopback_create_sysfs(struct video_device *vdev)
+{
+	int res = 0;
+
+#define V4L2_SYSFS_CREATE(x)                                 \
+	res = device_create_file(&vdev->dev, &dev_attr_##x); \
+	if (res < 0)                                         \
+	break
+	if (!vdev)
+		return;
+	do {
+		V4L2_SYSFS_CREATE(format);
+		V4L2_SYSFS_CREATE(buffers);
+		V4L2_SYSFS_CREATE(max_openers);
+		V4L2_SYSFS_CREATE(state);
+		/* ... */
+	} while (0);
+
+	if (res >= 0)
+		return;
+	dev_err(&vdev->dev, "%s error: %d\n", __func__, res);
+}
+
+/* Event APIs */
+
+#define V4L2LOOPBACK_EVENT_BASE (V4L2_EVENT_PRIVATE_START)
+#define V4L2LOOPBACK_EVENT_OFFSET 0x08E00000
+#define V4L2_EVENT_PRI_CLIENT_USAGE \
+	(V4L2LOOPBACK_EVENT_BASE + V4L2LOOPBACK_EVENT_OFFSET + 1)
+
+struct v4l2_event_client_usage {
+	__u32 count;
+};
+
+/* global module data */
+/* find a device based on it's device-number (e.g. '3' for /dev/video3) */
+struct v4l2loopback_lookup_cb_data {
+	int device_nr;
+	struct v4l2_loopback_device *device;
+};
+static int v4l2loopback_lookup_cb(int id, void *ptr, void *data)
+{
+	struct v4l2_loopback_device *device = ptr;
+	struct v4l2loopback_lookup_cb_data *cbdata = data;
+	if (cbdata && device && device->vdev) {
+		if (device->vdev->num == cbdata->device_nr) {
+			cbdata->device = device;
+			cbdata->device_nr = id;
+			return 1;
+		}
+	}
+	return 0;
+}
+static int v4l2loopback_lookup(int device_nr,
+			       struct v4l2_loopback_device **device)
+{
+	struct v4l2loopback_lookup_cb_data data = {
+		.device_nr = device_nr,
+		.device = NULL,
+	};
+	int err = idr_for_each(&v4l2loopback_index_idr, &v4l2loopback_lookup_cb,
+			       &data);
+	if (1 == err) {
+		if (device)
+			*device = data.device;
+		return data.device_nr;
+	}
+	return -ENODEV;
+}
+static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd)
+{
+	struct video_device *loopdev = to_video_device(cd);
+	struct v4l2loopback_private *ptr =
+		(struct v4l2loopback_private *)video_get_drvdata(loopdev);
+	int nr = ptr->device_nr;
+
+	return idr_find(&v4l2loopback_index_idr, nr);
+}
+
+static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f)
+{
+	struct v4l2loopback_private *ptr = video_drvdata(f);
+	int nr = ptr->device_nr;
+
+	return idr_find(&v4l2loopback_index_idr, nr);
+}
+
+/* forward declarations */
+static void client_usage_queue_event(struct video_device *vdev);
+static void init_buffers(struct v4l2_loopback_device *dev);
+static int allocate_buffers(struct v4l2_loopback_device *dev);
+static void free_buffers(struct v4l2_loopback_device *dev);
+static void try_free_buffers(struct v4l2_loopback_device *dev);
+static int allocate_timeout_image(struct v4l2_loopback_device *dev);
+static void check_timers(struct v4l2_loopback_device *dev);
+static const struct v4l2_file_operations v4l2_loopback_fops;
+static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops;
+
+/* Queue helpers */
+/* next functions sets buffer flags and adjusts counters accordingly */
+static inline void set_done(struct v4l2l_buffer *buffer)
+{
+	buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED;
+	buffer->buffer.flags |= V4L2_BUF_FLAG_DONE;
+}
+
+static inline void set_queued(struct v4l2l_buffer *buffer)
+{
+	buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE;
+	buffer->buffer.flags |= V4L2_BUF_FLAG_QUEUED;
+}
+
+static inline void unset_flags(struct v4l2l_buffer *buffer)
+{
+	buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED;
+	buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE;
+}
+
+/* V4L2 ioctl caps and params calls */
+/* returns device capabilities
+ * called on VIDIOC_QUERYCAP
+ */
+static int vidioc_querycap(struct file *file, void *priv,
+			   struct v4l2_capability *cap)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	int device_nr =
+		((struct v4l2loopback_private *)video_get_drvdata(dev->vdev))
+			->device_nr;
+	__u32 capabilities = V4L2_CAP_STREAMING | V4L2_CAP_READWRITE;
+
+	strscpy(cap->driver, "v4l2 loopback", sizeof(cap->driver));
+	snprintf(cap->card, sizeof(cap->card), "%s", dev->card_label);
+	snprintf(cap->bus_info, sizeof(cap->bus_info),
+		 "platform:v4l2loopback-%03d", device_nr);
+
+	if (dev->announce_all_caps) {
+		capabilities |= V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT;
+	} else {
+		if (dev->ready_for_capture) {
+			capabilities |= V4L2_CAP_VIDEO_CAPTURE;
+		}
+		if (dev->ready_for_output) {
+			capabilities |= V4L2_CAP_VIDEO_OUTPUT;
+		}
+	}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0)
+	dev->vdev->device_caps =
+#endif /* >=linux-4.7.0 */
+		cap->device_caps = cap->capabilities = capabilities;
+
+	cap->capabilities |= V4L2_CAP_DEVICE_CAPS;
+
+	memset(cap->reserved, 0, sizeof(cap->reserved));
+	return 0;
+}
+
+static int vidioc_enum_framesizes(struct file *file, void *fh,
+				  struct v4l2_frmsizeenum *argp)
+{
+	struct v4l2_loopback_device *dev;
+
+	/* there can be only one... */
+	if (argp->index)
+		return -EINVAL;
+
+	dev = v4l2loopback_getdevice(file);
+	if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) {
+		/* format has already been negotiated
+		 * cannot change during runtime
+		 */
+		if (argp->pixel_format != dev->pix_format.pixelformat)
+			return -EINVAL;
+
+		argp->type = V4L2_FRMSIZE_TYPE_DISCRETE;
+
+		argp->discrete.width = dev->pix_format.width;
+		argp->discrete.height = dev->pix_format.height;
+	} else {
+		/* if the format has not been negotiated yet, we accept anything
+		 */
+		if (NULL == format_by_fourcc(argp->pixel_format))
+			return -EINVAL;
+
+		if (dev->min_width == dev->max_width &&
+		    dev->min_height == dev->max_height) {
+			argp->type = V4L2_FRMSIZE_TYPE_DISCRETE;
+
+			argp->discrete.width = dev->min_width;
+			argp->discrete.height = dev->min_height;
+		} else {
+			argp->type = V4L2_FRMSIZE_TYPE_CONTINUOUS;
+
+			argp->stepwise.min_width = dev->min_width;
+			argp->stepwise.min_height = dev->min_height;
+
+			argp->stepwise.max_width = dev->max_width;
+			argp->stepwise.max_height = dev->max_height;
+
+			argp->stepwise.step_width = 1;
+			argp->stepwise.step_height = 1;
+		}
+	}
+	return 0;
+}
+
+/* returns frameinterval (fps) for the set resolution
+ * called on VIDIOC_ENUM_FRAMEINTERVALS
+ */
+static int vidioc_enum_frameintervals(struct file *file, void *fh,
+				      struct v4l2_frmivalenum *argp)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+
+	/* there can be only one... */
+	if (argp->index)
+		return -EINVAL;
+
+	if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) {
+		if (argp->width != dev->pix_format.width ||
+		    argp->height != dev->pix_format.height ||
+		    argp->pixel_format != dev->pix_format.pixelformat)
+			return -EINVAL;
+
+		argp->type = V4L2_FRMIVAL_TYPE_DISCRETE;
+		argp->discrete = dev->capture_param.timeperframe;
+	} else {
+		if (argp->width < dev->min_width ||
+		    argp->width > dev->max_width ||
+		    argp->height < dev->min_height ||
+		    argp->height > dev->max_height ||
+		    NULL == format_by_fourcc(argp->pixel_format))
+			return -EINVAL;
+
+		argp->type = V4L2_FRMIVAL_TYPE_CONTINUOUS;
+		argp->stepwise.min.numerator = 1;
+		argp->stepwise.min.denominator = V4L2LOOPBACK_FPS_MAX;
+		argp->stepwise.max.numerator = 1;
+		argp->stepwise.max.denominator = V4L2LOOPBACK_FPS_MIN;
+		argp->stepwise.step.numerator = 1;
+		argp->stepwise.step.denominator = 1;
+	}
+
+	return 0;
+}
+
+/* ------------------ CAPTURE ----------------------- */
+
+/* returns device formats
+ * called on VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE
+ */
+static int vidioc_enum_fmt_cap(struct file *file, void *fh,
+			       struct v4l2_fmtdesc *f)
+{
+	struct v4l2_loopback_device *dev;
+	const struct v4l2l_format *fmt;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+
+	if (f->index)
+		return -EINVAL;
+
+	if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) {
+		/* format has been fixed, so only one single format is supported */
+		const __u32 format = dev->pix_format.pixelformat;
+
+		if ((fmt = format_by_fourcc(format))) {
+			snprintf(f->description, sizeof(f->description), "%s",
+				 fmt->name);
+		} else {
+			snprintf(f->description, sizeof(f->description),
+				 "[%c%c%c%c]", (format >> 0) & 0xFF,
+				 (format >> 8) & 0xFF, (format >> 16) & 0xFF,
+				 (format >> 24) & 0xFF);
+		}
+
+		f->pixelformat = dev->pix_format.pixelformat;
+	} else {
+		return -EINVAL;
+	}
+	f->flags = 0;
+	MARK();
+	return 0;
+}
+
+/* returns current video format
+ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE
+ */
+static int vidioc_g_fmt_cap(struct file *file, void *priv,
+			    struct v4l2_format *fmt)
+{
+	struct v4l2_loopback_device *dev;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	if (!dev->ready_for_capture && !dev->ready_for_output)
+		return -EINVAL;
+
+	fmt->fmt.pix = dev->pix_format;
+	MARK();
+	return 0;
+}
+
+/* checks if it is OK to change to format fmt;
+ * actual check is done by inner_try_setfmt
+ * just checking that pixelformat is OK and set other parameters, app should
+ * obey this decision
+ * called on VIDIOC_TRY_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE
+ */
+static int vidioc_try_fmt_cap(struct file *file, void *priv,
+			      struct v4l2_format *fmt)
+{
+	int ret = 0;
+	if (!V4L2_TYPE_IS_CAPTURE(fmt->type))
+		return -EINVAL;
+	ret = inner_try_setfmt(file, fmt);
+	if (-EBUSY == ret)
+		return 0;
+	return ret;
+}
+
+/* sets new output format, if possible
+ * actually format is set  by input and we even do not check it, just return
+ * current one, but it is possible to set subregions of input TODO(vasaka)
+ * called on VIDIOC_S_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE
+ */
+static int vidioc_s_fmt_cap(struct file *file, void *priv,
+			    struct v4l2_format *fmt)
+{
+	int ret;
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	if (!V4L2_TYPE_IS_CAPTURE(fmt->type))
+		return -EINVAL;
+	ret = inner_try_setfmt(file, fmt);
+	if (!ret) {
+		dev->pix_format = fmt->fmt.pix;
+	}
+	return ret;
+}
+
+/* ------------------ OUTPUT ----------------------- */
+
+/* returns device formats;
+ * LATER: allow all formats
+ * called on VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT
+ */
+static int vidioc_enum_fmt_out(struct file *file, void *fh,
+			       struct v4l2_fmtdesc *f)
+{
+	struct v4l2_loopback_device *dev;
+	const struct v4l2l_format *fmt;
+
+	dev = v4l2loopback_getdevice(file);
+
+	if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) {
+		/* format has been fixed, so only one single format is supported */
+		const __u32 format = dev->pix_format.pixelformat;
+
+		if (f->index)
+			return -EINVAL;
+
+		if ((fmt = format_by_fourcc(format))) {
+			snprintf(f->description, sizeof(f->description), "%s",
+				 fmt->name);
+		} else {
+			snprintf(f->description, sizeof(f->description),
+				 "[%c%c%c%c]", (format >> 0) & 0xFF,
+				 (format >> 8) & 0xFF, (format >> 16) & 0xFF,
+				 (format >> 24) & 0xFF);
+		}
+
+		f->pixelformat = dev->pix_format.pixelformat;
+	} else {
+		/* fill in a dummy format */
+		/* coverity[unsigned_compare] */
+		if (f->index < 0 || f->index >= FORMATS)
+			return -EINVAL;
+
+		fmt = &formats[f->index];
+
+		f->pixelformat = fmt->fourcc;
+		snprintf(f->description, sizeof(f->description), "%s",
+			 fmt->name);
+	}
+	f->flags = 0;
+
+	return 0;
+}
+
+/* returns current video format format fmt */
+/* NOTE: this is called from the producer
+ * so if format has not been negotiated yet,
+ * it should return ALL of available formats,
+ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT
+ */
+static int vidioc_g_fmt_out(struct file *file, void *priv,
+			    struct v4l2_format *fmt)
+{
+	struct v4l2_loopback_device *dev;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+
+	/*
+	 * LATER: this should return the currently valid format
+	 * gstreamer doesn't like it, if this returns -EINVAL, as it
+	 * then concludes that there is _no_ valid format
+	 * CHECK whether this assumption is wrong,
+	 * or whether we have to always provide a valid format
+	 */
+
+	fmt->fmt.pix = dev->pix_format;
+	return 0;
+}
+
+/* checks if it is OK to change to format fmt;
+ * if format is negotiated do not change it
+ * called on VIDIOC_TRY_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT
+ */
+static int vidioc_try_fmt_out(struct file *file, void *priv,
+			      struct v4l2_format *fmt)
+{
+	int ret = 0;
+	if (!V4L2_TYPE_IS_OUTPUT(fmt->type))
+		return -EINVAL;
+	ret = inner_try_setfmt(file, fmt);
+	if (-EBUSY == ret)
+		return 0;
+	return ret;
+}
+
+/* sets new output format, if possible;
+ * allocate data here because we do not know if it will be streaming or
+ * read/write IO
+ * called on VIDIOC_S_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT
+ */
+static int vidioc_s_fmt_out(struct file *file, void *priv,
+			    struct v4l2_format *fmt)
+{
+	struct v4l2_loopback_device *dev;
+	int ret;
+	char buf[5];
+	buf[4] = 0;
+	if (!V4L2_TYPE_IS_OUTPUT(fmt->type))
+		return -EINVAL;
+	dev = v4l2loopback_getdevice(file);
+
+	ret = inner_try_setfmt(file, fmt);
+	if (!ret) {
+		dev->pix_format = fmt->fmt.pix;
+		dev->pix_format_has_valid_sizeimage =
+			v4l2l_pix_format_has_valid_sizeimage(fmt);
+		dprintk("s_fmt_out(%d) %d...%d\n", ret, dev->ready_for_capture,
+			dev->pix_format.sizeimage);
+		dprintk("outFOURCC=%s\n",
+			fourcc2str(dev->pix_format.pixelformat, buf));
+
+		if (!dev->ready_for_capture) {
+			dev->buffer_size =
+				PAGE_ALIGN(dev->pix_format.sizeimage);
+			// JMZ: TODO get rid of the next line
+			fmt->fmt.pix.sizeimage = dev->buffer_size;
+			ret = allocate_buffers(dev);
+		}
+	}
+	return ret;
+}
+
+// #define V4L2L_OVERLAY
+#ifdef V4L2L_OVERLAY
+/* ------------------ OVERLAY ----------------------- */
+/* currently unsupported */
+/* GSTreamer's v4l2sink is buggy, as it requires the overlay to work
+ * while it should only require it, if overlay is requested
+ * once the gstreamer element is fixed, remove the overlay dummies
+ */
+#warning OVERLAY dummies
+static int vidioc_g_fmt_overlay(struct file *file, void *priv,
+				struct v4l2_format *fmt)
+{
+	return 0;
+}
+
+static int vidioc_s_fmt_overlay(struct file *file, void *priv,
+				struct v4l2_format *fmt)
+{
+	return 0;
+}
+#endif /* V4L2L_OVERLAY */
+
+/* ------------------ PARAMs ----------------------- */
+
+/* get some data flow parameters, only capability, fps and readbuffers has
+ * effect on this driver
+ * called on VIDIOC_G_PARM
+ */
+static int vidioc_g_parm(struct file *file, void *priv,
+			 struct v4l2_streamparm *parm)
+{
+	/* do not care about type of opener, hope these enums would always be
+	 * compatible */
+	struct v4l2_loopback_device *dev;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	parm->parm.capture = dev->capture_param;
+	return 0;
+}
+
+/* get some data flow parameters, only capability, fps and readbuffers has
+ * effect on this driver
+ * called on VIDIOC_S_PARM
+ */
+static int vidioc_s_parm(struct file *file, void *priv,
+			 struct v4l2_streamparm *parm)
+{
+	struct v4l2_loopback_device *dev;
+	int err = 0;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	dprintk("vidioc_s_parm called frate=%d/%d\n",
+		parm->parm.capture.timeperframe.numerator,
+		parm->parm.capture.timeperframe.denominator);
+
+	switch (parm->type) {
+	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+		if ((err = set_timeperframe(
+			     dev, &parm->parm.capture.timeperframe)) < 0)
+			return err;
+		break;
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+		if ((err = set_timeperframe(
+			     dev, &parm->parm.capture.timeperframe)) < 0)
+			return err;
+		break;
+	default:
+		return -1;
+	}
+
+	parm->parm.capture = dev->capture_param;
+	return 0;
+}
+
+#ifdef V4L2LOOPBACK_WITH_STD
+/* sets a tv standard, actually we do not need to handle this any special way
+ * added to support effecttv
+ * called on VIDIOC_S_STD
+ */
+static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id *_std)
+{
+	v4l2_std_id req_std = 0, supported_std = 0;
+	const v4l2_std_id all_std = V4L2_STD_ALL, no_std = 0;
+
+	if (_std) {
+		req_std = *_std;
+		*_std = all_std;
+	}
+
+	/* we support everything in V4L2_STD_ALL, but not more... */
+	supported_std = (all_std & req_std);
+	if (no_std == supported_std)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* gets a fake video standard
+ * called on VIDIOC_G_STD
+ */
+static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm)
+{
+	if (norm)
+		*norm = V4L2_STD_ALL;
+	return 0;
+}
+/* gets a fake video standard
+ * called on VIDIOC_QUERYSTD
+ */
+static int vidioc_querystd(struct file *file, void *fh, v4l2_std_id *norm)
+{
+	if (norm)
+		*norm = V4L2_STD_ALL;
+	return 0;
+}
+#endif /* V4L2LOOPBACK_WITH_STD */
+
+static int v4l2loopback_set_ctrl(struct v4l2_loopback_device *dev, u32 id,
+				 s64 val)
+{
+	switch (id) {
+	case CID_KEEP_FORMAT:
+		if (val < 0 || val > 1)
+			return -EINVAL;
+		dev->keep_format = val;
+		try_free_buffers(
+			dev); /* will only free buffers if !keep_format */
+		break;
+	case CID_SUSTAIN_FRAMERATE:
+		if (val < 0 || val > 1)
+			return -EINVAL;
+		spin_lock_bh(&dev->lock);
+		dev->sustain_framerate = val;
+		check_timers(dev);
+		spin_unlock_bh(&dev->lock);
+		break;
+	case CID_TIMEOUT:
+		if (val < 0 || val > MAX_TIMEOUT)
+			return -EINVAL;
+		spin_lock_bh(&dev->lock);
+		dev->timeout_jiffies = msecs_to_jiffies(val);
+		check_timers(dev);
+		spin_unlock_bh(&dev->lock);
+		allocate_timeout_image(dev);
+		break;
+	case CID_TIMEOUT_IMAGE_IO:
+		dev->timeout_image_io = 1;
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl)
+{
+	struct v4l2_loopback_device *dev = container_of(
+		ctrl->handler, struct v4l2_loopback_device, ctrl_handler);
+	return v4l2loopback_set_ctrl(dev, ctrl->id, ctrl->val);
+}
+
+/* returns set of device outputs, in our case there is only one
+ * called on VIDIOC_ENUMOUTPUT
+ */
+static int vidioc_enum_output(struct file *file, void *fh,
+			      struct v4l2_output *outp)
+{
+	__u32 index = outp->index;
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	MARK();
+
+	if (!dev->announce_all_caps && !dev->ready_for_output)
+		return -ENOTTY;
+
+	if (0 != index)
+		return -EINVAL;
+
+	/* clear all data (including the reserved fields) */
+	memset(outp, 0, sizeof(*outp));
+
+	outp->index = index;
+	strscpy(outp->name, "loopback in", sizeof(outp->name));
+	outp->type = V4L2_OUTPUT_TYPE_ANALOG;
+	outp->audioset = 0;
+	outp->modulator = 0;
+#ifdef V4L2LOOPBACK_WITH_STD
+	outp->std = V4L2_STD_ALL;
+#ifdef V4L2_OUT_CAP_STD
+	outp->capabilities |= V4L2_OUT_CAP_STD;
+#endif /*  V4L2_OUT_CAP_STD */
+#endif /* V4L2LOOPBACK_WITH_STD */
+
+	return 0;
+}
+
+/* which output is currently active,
+ * called on VIDIOC_G_OUTPUT
+ */
+static int vidioc_g_output(struct file *file, void *fh, unsigned int *i)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	if (!dev->announce_all_caps && !dev->ready_for_output)
+		return -ENOTTY;
+	if (i)
+		*i = 0;
+	return 0;
+}
+
+/* set output, can make sense if we have more than one video src,
+ * called on VIDIOC_S_OUTPUT
+ */
+static int vidioc_s_output(struct file *file, void *fh, unsigned int i)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	if (!dev->announce_all_caps && !dev->ready_for_output)
+		return -ENOTTY;
+
+	if (i)
+		return -EINVAL;
+
+	return 0;
+}
+
+/* returns set of device inputs, in our case there is only one,
+ * but later I may add more
+ * called on VIDIOC_ENUMINPUT
+ */
+static int vidioc_enum_input(struct file *file, void *fh,
+			     struct v4l2_input *inp)
+{
+	struct v4l2_loopback_device *dev;
+	__u32 index = inp->index;
+	MARK();
+
+	if (0 != index)
+		return -EINVAL;
+
+	/* clear all data (including the reserved fields) */
+	memset(inp, 0, sizeof(*inp));
+
+	inp->index = index;
+	strscpy(inp->name, "loopback", sizeof(inp->name));
+	inp->type = V4L2_INPUT_TYPE_CAMERA;
+	inp->audioset = 0;
+	inp->tuner = 0;
+	inp->status = 0;
+
+#ifdef V4L2LOOPBACK_WITH_STD
+	inp->std = V4L2_STD_ALL;
+#ifdef V4L2_IN_CAP_STD
+	inp->capabilities |= V4L2_IN_CAP_STD;
+#endif
+#endif /* V4L2LOOPBACK_WITH_STD */
+
+	dev = v4l2loopback_getdevice(file);
+	if (!dev->ready_for_capture) {
+		inp->status |= V4L2_IN_ST_NO_SIGNAL;
+	}
+
+	return 0;
+}
+
+/* which input is currently active,
+ * called on VIDIOC_G_INPUT
+ */
+static int vidioc_g_input(struct file *file, void *fh, unsigned int *i)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	if (!dev->announce_all_caps && !dev->ready_for_capture)
+		return -ENOTTY;
+	if (i)
+		*i = 0;
+	return 0;
+}
+
+/* set input, can make sense if we have more than one video src,
+ * called on VIDIOC_S_INPUT
+ */
+static int vidioc_s_input(struct file *file, void *fh, unsigned int i)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	if (!dev->announce_all_caps && !dev->ready_for_capture)
+		return -ENOTTY;
+	if (i == 0)
+		return 0;
+	return -EINVAL;
+}
+
+/* --------------- V4L2 ioctl buffer related calls ----------------- */
+
+/* negotiate buffer type
+ * only mmap streaming supported
+ * called on VIDIOC_REQBUFS
+ */
+static int vidioc_reqbufs(struct file *file, void *fh,
+			  struct v4l2_requestbuffers *b)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+	int i;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(fh);
+
+	dprintk("reqbufs: %d\t%d=%d\n", b->memory, b->count,
+		dev->buffers_number);
+
+	if (opener->timeout_image_io) {
+		dev->timeout_image_io = 0;
+		if (b->memory != V4L2_MEMORY_MMAP)
+			return -EINVAL;
+		b->count = 2;
+		return 0;
+	}
+
+	if (V4L2_TYPE_IS_OUTPUT(b->type) && (!dev->ready_for_output)) {
+		return -EBUSY;
+	}
+
+	init_buffers(dev);
+	switch (b->memory) {
+	case V4L2_MEMORY_MMAP:
+		/* do nothing here, buffers are always allocated */
+		if (b->count < 1 || dev->buffers_number < 1)
+			return 0;
+
+		if (b->count > dev->buffers_number)
+			b->count = dev->buffers_number;
+
+		/* make sure that outbufs_list contains buffers from 0 to used_buffers-1
+		 * actually, it will have been already populated via v4l2_loopback_init()
+		 * at this point */
+		if (list_empty(&dev->outbufs_list)) {
+			for (i = 0; i < dev->used_buffers; ++i)
+				list_add_tail(&dev->buffers[i].list_head,
+					      &dev->outbufs_list);
+		}
+
+		/* also, if dev->used_buffers is going to be decreased, we should remove
+		 * out-of-range buffers from outbufs_list, and fix bufpos2index mapping */
+		if (b->count < dev->used_buffers) {
+			struct v4l2l_buffer *pos, *n;
+
+			list_for_each_entry_safe(pos, n, &dev->outbufs_list,
+						 list_head) {
+				if (pos->buffer.index >= b->count)
+					list_del(&pos->list_head);
+			}
+
+			/* after we update dev->used_buffers, buffers in outbufs_list will
+			 * correspond to dev->write_position + [0;b->count-1] range */
+			i = v4l2l_mod64(dev->write_position, b->count);
+			list_for_each_entry(pos, &dev->outbufs_list,
+					    list_head) {
+				dev->bufpos2index[i % b->count] =
+					pos->buffer.index;
+				++i;
+			}
+		}
+
+		opener->buffers_number = b->count;
+		if (opener->buffers_number < dev->used_buffers)
+			dev->used_buffers = opener->buffers_number;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/* returns buffer asked for;
+ * give app as many buffers as it wants, if it less than MAX,
+ * but map them in our inner buffers
+ * called on VIDIOC_QUERYBUF
+ */
+static int vidioc_querybuf(struct file *file, void *fh, struct v4l2_buffer *b)
+{
+	enum v4l2_buf_type type;
+	int index;
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+
+	MARK();
+
+	type = b->type;
+	index = b->index;
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(fh);
+
+	if ((b->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) &&
+	    (b->type != V4L2_BUF_TYPE_VIDEO_OUTPUT)) {
+		return -EINVAL;
+	}
+	if (b->index > max_buffers)
+		return -EINVAL;
+
+	if (opener->timeout_image_io)
+		*b = dev->timeout_image_buffer.buffer;
+	else
+		*b = dev->buffers[b->index % dev->used_buffers].buffer;
+
+	b->type = type;
+	b->index = index;
+	dprintkrw("buffer type: %d (of %d with size=%ld)\n", b->memory,
+		  dev->buffers_number, dev->buffer_size);
+
+	/*  Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture'
+            https://github.com/umlaeute/v4l2loopback/issues/60 */
+	b->flags &= ~V4L2_BUF_FLAG_DONE;
+	b->flags |= V4L2_BUF_FLAG_QUEUED;
+
+	return 0;
+}
+
+static void buffer_written(struct v4l2_loopback_device *dev,
+			   struct v4l2l_buffer *buf)
+{
+	del_timer_sync(&dev->sustain_timer);
+	del_timer_sync(&dev->timeout_timer);
+
+	spin_lock_bh(&dev->list_lock);
+	list_move_tail(&buf->list_head, &dev->outbufs_list);
+	spin_unlock_bh(&dev->list_lock);
+
+	spin_lock_bh(&dev->lock);
+	dev->bufpos2index[v4l2l_mod64(dev->write_position, dev->used_buffers)] =
+		buf->buffer.index;
+	++dev->write_position;
+	dev->reread_count = 0;
+
+	check_timers(dev);
+	spin_unlock_bh(&dev->lock);
+}
+
+/* put buffer to queue
+ * called on VIDIOC_QBUF
+ */
+static int vidioc_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+	struct v4l2l_buffer *b;
+	int index;
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(fh);
+
+	if (buf->index > max_buffers)
+		return -EINVAL;
+	if (opener->timeout_image_io)
+		return 0;
+
+	index = buf->index % dev->used_buffers;
+	b = &dev->buffers[index];
+
+	switch (buf->type) {
+	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+		dprintkrw(
+			"qbuf(CAPTURE)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n",
+			index, buf->index, buf, buf->type, buf->bytesused,
+			buf->length, buf->flags, buf->field,
+			(long long)buf->timestamp.tv_sec,
+			(long int)buf->timestamp.tv_usec, buf->sequence);
+		set_queued(b);
+		return 0;
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+		dprintkrw(
+			"qbuf(OUTPUT)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n",
+			index, buf->index, buf, buf->type, buf->bytesused,
+			buf->length, buf->flags, buf->field,
+			(long long)buf->timestamp.tv_sec,
+			(long int)buf->timestamp.tv_usec, buf->sequence);
+		if ((!(b->buffer.flags & V4L2_BUF_FLAG_TIMESTAMP_COPY)) &&
+		    (buf->timestamp.tv_sec == 0 && buf->timestamp.tv_usec == 0))
+			v4l2l_get_timestamp(&b->buffer);
+		else {
+			b->buffer.timestamp = buf->timestamp;
+			b->buffer.flags |= V4L2_BUF_FLAG_TIMESTAMP_COPY;
+		}
+		if (dev->pix_format_has_valid_sizeimage) {
+			if (buf->bytesused >= dev->pix_format.sizeimage) {
+				b->buffer.bytesused = dev->pix_format.sizeimage;
+			} else {
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0)
+				dev_warn_ratelimited(
+					&dev->vdev->dev,
+#else
+				dprintkrw(
+#endif
+					"warning queued output buffer bytesused too small %d < %d\n",
+					buf->bytesused,
+					dev->pix_format.sizeimage);
+				b->buffer.bytesused = buf->bytesused;
+			}
+		} else {
+			b->buffer.bytesused = buf->bytesused;
+		}
+
+		set_done(b);
+		buffer_written(dev, b);
+
+		/*  Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture'
+                    https://github.com/umlaeute/v4l2loopback/issues/60 */
+		buf->flags &= ~V4L2_BUF_FLAG_DONE;
+		buf->flags |= V4L2_BUF_FLAG_QUEUED;
+
+		wake_up_all(&dev->read_event);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int can_read(struct v4l2_loopback_device *dev,
+		    struct v4l2_loopback_opener *opener)
+{
+	int ret;
+
+	spin_lock_bh(&dev->lock);
+	check_timers(dev);
+	ret = dev->write_position > opener->read_position ||
+	      dev->reread_count > opener->reread_count || dev->timeout_happened;
+	spin_unlock_bh(&dev->lock);
+	return ret;
+}
+
+static int get_capture_buffer(struct file *file)
+{
+	struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file);
+	struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data);
+	int pos, ret;
+	int timeout_happened;
+
+	if ((file->f_flags & O_NONBLOCK) &&
+	    (dev->write_position <= opener->read_position &&
+	     dev->reread_count <= opener->reread_count &&
+	     !dev->timeout_happened))
+		return -EAGAIN;
+	wait_event_interruptible(dev->read_event, can_read(dev, opener));
+
+	spin_lock_bh(&dev->lock);
+	if (dev->write_position == opener->read_position) {
+		if (dev->reread_count > opener->reread_count + 2)
+			opener->reread_count = dev->reread_count - 1;
+		++opener->reread_count;
+		pos = v4l2l_mod64(opener->read_position + dev->used_buffers - 1,
+				  dev->used_buffers);
+	} else {
+		opener->reread_count = 0;
+		if (dev->write_position >
+		    opener->read_position + dev->used_buffers)
+			opener->read_position = dev->write_position - 1;
+		pos = v4l2l_mod64(opener->read_position, dev->used_buffers);
+		++opener->read_position;
+	}
+	timeout_happened = dev->timeout_happened;
+	dev->timeout_happened = 0;
+	spin_unlock_bh(&dev->lock);
+
+	ret = dev->bufpos2index[pos];
+	if (timeout_happened) {
+		if (ret < 0) {
+			dprintk("trying to return not mapped buf[%d]\n", ret);
+			return -EFAULT;
+		}
+		/* although allocated on-demand, timeout_image is freed only
+		 * in free_buffers(), so we don't need to worry about it being
+		 * deallocated suddenly */
+		memcpy(dev->image + dev->buffers[ret].buffer.m.offset,
+		       dev->timeout_image, dev->buffer_size);
+	}
+	return ret;
+}
+
+/* put buffer to dequeue
+ * called on VIDIOC_DQBUF
+ */
+static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+	int index;
+	struct v4l2l_buffer *b;
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(fh);
+	if (opener->timeout_image_io) {
+		*buf = dev->timeout_image_buffer.buffer;
+		return 0;
+	}
+
+	switch (buf->type) {
+	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+		index = get_capture_buffer(file);
+		if (index < 0)
+			return index;
+		dprintkrw("capture DQBUF pos: %lld index: %d\n",
+			  (long long)(opener->read_position - 1), index);
+		if (!(dev->buffers[index].buffer.flags &
+		      V4L2_BUF_FLAG_MAPPED)) {
+			dprintk("trying to return not mapped buf[%d]\n", index);
+			return -EINVAL;
+		}
+		unset_flags(&dev->buffers[index]);
+		*buf = dev->buffers[index].buffer;
+		dprintkrw(
+			"dqbuf(CAPTURE)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n",
+			index, buf->index, buf, buf->type, buf->bytesused,
+			buf->length, buf->flags, buf->field,
+			(long long)buf->timestamp.tv_sec,
+			(long int)buf->timestamp.tv_usec, buf->sequence);
+		return 0;
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+		spin_lock_bh(&dev->list_lock);
+
+		b = list_entry(dev->outbufs_list.prev, struct v4l2l_buffer,
+			       list_head);
+		list_move_tail(&b->list_head, &dev->outbufs_list);
+
+		spin_unlock_bh(&dev->list_lock);
+		dprintkrw("output DQBUF index: %d\n", b->buffer.index);
+		unset_flags(b);
+		*buf = b->buffer;
+		buf->type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
+		dprintkrw(
+			"dqbuf(OUTPUT)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n",
+			index, buf->index, buf, buf->type, buf->bytesused,
+			buf->length, buf->flags, buf->field,
+			(long long)buf->timestamp.tv_sec,
+			(long int)buf->timestamp.tv_usec, buf->sequence);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/* ------------- STREAMING ------------------- */
+
+/* start streaming
+ * called on VIDIOC_STREAMON
+ */
+static int vidioc_streamon(struct file *file, void *fh, enum v4l2_buf_type type)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(fh);
+
+	switch (type) {
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+		if (!dev->ready_for_capture) {
+			int ret = allocate_buffers(dev);
+			if (ret < 0)
+				return ret;
+		}
+		opener->type = WRITER;
+		dev->ready_for_output = 0;
+		dev->ready_for_capture++;
+		return 0;
+	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+		if (!dev->ready_for_capture)
+			return -EIO;
+		if (dev->active_readers > 0)
+			return -EBUSY;
+		opener->type = READER;
+		dev->active_readers++;
+		client_usage_queue_event(dev->vdev);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+	return -EINVAL;
+}
+
+/* stop streaming
+ * called on VIDIOC_STREAMOFF
+ */
+static int vidioc_streamoff(struct file *file, void *fh,
+			    enum v4l2_buf_type type)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+
+	MARK();
+	dprintk("%d\n", type);
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(fh);
+	switch (type) {
+	case V4L2_BUF_TYPE_VIDEO_OUTPUT:
+		if (dev->ready_for_capture > 0)
+			dev->ready_for_capture--;
+		return 0;
+	case V4L2_BUF_TYPE_VIDEO_CAPTURE:
+		if (opener->type == READER) {
+			opener->type = 0;
+			dev->active_readers--;
+			client_usage_queue_event(dev->vdev);
+		}
+		return 0;
+	default:
+		return -EINVAL;
+	}
+	return -EINVAL;
+}
+
+#ifdef CONFIG_VIDEO_V4L1_COMPAT
+static int vidiocgmbuf(struct file *file, void *fh, struct video_mbuf *p)
+{
+	struct v4l2_loopback_device *dev;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	p->frames = dev->buffers_number;
+	p->offsets[0] = 0;
+	p->offsets[1] = 0;
+	p->size = dev->buffer_size;
+	return 0;
+}
+#endif
+
+static void client_usage_queue_event(struct video_device *vdev)
+{
+	struct v4l2_event ev;
+	struct v4l2_loopback_device *dev;
+
+	dev = container_of(vdev->v4l2_dev, struct v4l2_loopback_device,
+			   v4l2_dev);
+
+	memset(&ev, 0, sizeof(ev));
+	ev.type = V4L2_EVENT_PRI_CLIENT_USAGE;
+	((struct v4l2_event_client_usage *)&ev.u)->count = dev->active_readers;
+
+	v4l2_event_queue(vdev, &ev);
+}
+
+static int client_usage_ops_add(struct v4l2_subscribed_event *sev,
+				unsigned elems)
+{
+	if (!(sev->flags & V4L2_EVENT_SUB_FL_SEND_INITIAL))
+		return 0;
+
+	client_usage_queue_event(sev->fh->vdev);
+	return 0;
+}
+
+static void client_usage_ops_replace(struct v4l2_event *old,
+				     const struct v4l2_event *new)
+{
+	*((struct v4l2_event_client_usage *)&old->u) =
+		*((struct v4l2_event_client_usage *)&new->u);
+}
+
+static void client_usage_ops_merge(const struct v4l2_event *old,
+				   struct v4l2_event *new)
+{
+	*((struct v4l2_event_client_usage *)&new->u) =
+		*((struct v4l2_event_client_usage *)&old->u);
+}
+
+const struct v4l2_subscribed_event_ops client_usage_ops = {
+	.add = client_usage_ops_add,
+	.replace = client_usage_ops_replace,
+	.merge = client_usage_ops_merge,
+};
+
+static int vidioc_subscribe_event(struct v4l2_fh *fh,
+				  const struct v4l2_event_subscription *sub)
+{
+	switch (sub->type) {
+	case V4L2_EVENT_CTRL:
+		return v4l2_ctrl_subscribe_event(fh, sub);
+	case V4L2_EVENT_PRI_CLIENT_USAGE:
+		return v4l2_event_subscribe(fh, sub, 0, &client_usage_ops);
+	}
+
+	return -EINVAL;
+}
+
+/* file operations */
+static void vm_open(struct vm_area_struct *vma)
+{
+	struct v4l2l_buffer *buf;
+	MARK();
+
+	buf = vma->vm_private_data;
+	buf->use_count++;
+
+	buf->buffer.flags |= V4L2_BUF_FLAG_MAPPED;
+}
+
+static void vm_close(struct vm_area_struct *vma)
+{
+	struct v4l2l_buffer *buf;
+	MARK();
+
+	buf = vma->vm_private_data;
+	buf->use_count--;
+
+	if (buf->use_count <= 0)
+		buf->buffer.flags &= ~V4L2_BUF_FLAG_MAPPED;
+}
+
+static struct vm_operations_struct vm_ops = {
+	.open = vm_open,
+	.close = vm_close,
+};
+
+static int v4l2_loopback_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	u8 *addr;
+	unsigned long start;
+	unsigned long size;
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+	struct v4l2l_buffer *buffer = NULL;
+	MARK();
+
+	start = (unsigned long)vma->vm_start;
+	size = (unsigned long)(vma->vm_end - vma->vm_start);
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(file->private_data);
+
+	if (size > dev->buffer_size) {
+		dprintk("userspace tries to mmap too much, fail\n");
+		return -EINVAL;
+	}
+	if (opener->timeout_image_io) {
+		/* we are going to map the timeout_image_buffer */
+		if ((vma->vm_pgoff << PAGE_SHIFT) !=
+		    dev->buffer_size * MAX_BUFFERS) {
+			dprintk("invalid mmap offset for timeout_image_io mode\n");
+			return -EINVAL;
+		}
+	} else if ((vma->vm_pgoff << PAGE_SHIFT) >
+		   dev->buffer_size * (dev->buffers_number - 1)) {
+		dprintk("userspace tries to mmap too far, fail\n");
+		return -EINVAL;
+	}
+
+	/* FIXXXXXME: allocation should not happen here! */
+	if (NULL == dev->image)
+		if (allocate_buffers(dev) < 0)
+			return -EINVAL;
+
+	if (opener->timeout_image_io) {
+		buffer = &dev->timeout_image_buffer;
+		addr = dev->timeout_image;
+	} else {
+		int i;
+		for (i = 0; i < dev->buffers_number; ++i) {
+			buffer = &dev->buffers[i];
+			if ((buffer->buffer.m.offset >> PAGE_SHIFT) ==
+			    vma->vm_pgoff)
+				break;
+		}
+
+		if (i >= dev->buffers_number)
+			return -EINVAL;
+
+		addr = dev->image + (vma->vm_pgoff << PAGE_SHIFT);
+	}
+
+	while (size > 0) {
+		struct page *page;
+
+		page = vmalloc_to_page(addr);
+
+		if (vm_insert_page(vma, start, page) < 0)
+			return -EAGAIN;
+
+		start += PAGE_SIZE;
+		addr += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	}
+
+	vma->vm_ops = &vm_ops;
+	vma->vm_private_data = buffer;
+
+	vm_open(vma);
+
+	MARK();
+	return 0;
+}
+
+static unsigned int v4l2_loopback_poll(struct file *file,
+				       struct poll_table_struct *pts)
+{
+	struct v4l2_loopback_opener *opener;
+	struct v4l2_loopback_device *dev;
+	__poll_t req_events = poll_requested_events(pts);
+	int ret_mask = 0;
+	MARK();
+
+	opener = fh_to_opener(file->private_data);
+	dev = v4l2loopback_getdevice(file);
+
+	if (req_events & POLLPRI) {
+		if (!v4l2_event_pending(&opener->fh))
+			poll_wait(file, &opener->fh.wait, pts);
+		if (v4l2_event_pending(&opener->fh)) {
+			ret_mask |= POLLPRI;
+			if (!(req_events & DEFAULT_POLLMASK))
+				return ret_mask;
+		}
+	}
+
+	switch (opener->type) {
+	case WRITER:
+		ret_mask |= POLLOUT | POLLWRNORM;
+		break;
+	case READER:
+		if (!can_read(dev, opener)) {
+			if (ret_mask)
+				return ret_mask;
+			poll_wait(file, &dev->read_event, pts);
+		}
+		if (can_read(dev, opener))
+			ret_mask |= POLLIN | POLLRDNORM;
+		if (v4l2_event_pending(&opener->fh))
+			ret_mask |= POLLPRI;
+		break;
+	default:
+		break;
+	}
+
+	MARK();
+	return ret_mask;
+}
+
+/* do not want to limit device opens, it can be as many readers as user want,
+ * writers are limited by means of setting writer field */
+static int v4l2_loopback_open(struct file *file)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_opener *opener;
+	MARK();
+	dev = v4l2loopback_getdevice(file);
+	if (dev->open_count.counter >= dev->max_openers)
+		return -EBUSY;
+	/* kfree on close */
+	opener = kzalloc(sizeof(*opener), GFP_KERNEL);
+	if (opener == NULL)
+		return -ENOMEM;
+
+	atomic_inc(&dev->open_count);
+
+	opener->timeout_image_io = dev->timeout_image_io;
+	if (opener->timeout_image_io) {
+		int r = allocate_timeout_image(dev);
+
+		if (r < 0) {
+			dprintk("timeout image allocation failed\n");
+
+			atomic_dec(&dev->open_count);
+
+			kfree(opener);
+			return r;
+		}
+	}
+
+	v4l2_fh_init(&opener->fh, video_devdata(file));
+	file->private_data = &opener->fh;
+
+	v4l2_fh_add(&opener->fh);
+	dprintk("opened dev:%p with image:%p\n", dev, dev ? dev->image : NULL);
+	MARK();
+	return 0;
+}
+
+static int v4l2_loopback_close(struct file *file)
+{
+	struct v4l2_loopback_opener *opener;
+	struct v4l2_loopback_device *dev;
+	int is_writer = 0, is_reader = 0;
+	MARK();
+
+	opener = fh_to_opener(file->private_data);
+	dev = v4l2loopback_getdevice(file);
+
+	if (WRITER == opener->type)
+		is_writer = 1;
+	if (READER == opener->type)
+		is_reader = 1;
+
+	atomic_dec(&dev->open_count);
+	if (dev->open_count.counter == 0) {
+		del_timer_sync(&dev->sustain_timer);
+		del_timer_sync(&dev->timeout_timer);
+	}
+	try_free_buffers(dev);
+
+	v4l2_fh_del(&opener->fh);
+	v4l2_fh_exit(&opener->fh);
+
+	kfree(opener);
+	if (is_writer)
+		dev->ready_for_output = 1;
+	if (is_reader) {
+		dev->active_readers--;
+		client_usage_queue_event(dev->vdev);
+	}
+	MARK();
+	return 0;
+}
+
+static ssize_t v4l2_loopback_read(struct file *file, char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	int read_index;
+	struct v4l2_loopback_device *dev;
+	struct v4l2_buffer *b;
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+
+	read_index = get_capture_buffer(file);
+	if (read_index < 0)
+		return read_index;
+	if (count > dev->buffer_size)
+		count = dev->buffer_size;
+	b = &dev->buffers[read_index].buffer;
+	if (count > b->bytesused)
+		count = b->bytesused;
+	if (copy_to_user((void *)buf, (void *)(dev->image + b->m.offset),
+			 count)) {
+		printk(KERN_ERR
+		       "v4l2-loopback: failed copy_to_user() in read buf\n");
+		return -EFAULT;
+	}
+	dprintkrw("leave v4l2_loopback_read()\n");
+	return count;
+}
+
+static ssize_t v4l2_loopback_write(struct file *file, const char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct v4l2_loopback_opener *opener;
+	struct v4l2_loopback_device *dev;
+	int write_index;
+	struct v4l2_buffer *b;
+	int err = 0;
+
+	MARK();
+
+	dev = v4l2loopback_getdevice(file);
+	opener = fh_to_opener(file->private_data);
+
+	if (UNNEGOTIATED == opener->type) {
+		spin_lock(&dev->lock);
+
+		if (dev->ready_for_output) {
+			err = vidioc_streamon(file, file->private_data,
+					      V4L2_BUF_TYPE_VIDEO_OUTPUT);
+		}
+
+		spin_unlock(&dev->lock);
+
+		if (err < 0)
+			return err;
+	}
+
+	if (WRITER != opener->type)
+		return -EINVAL;
+
+	if (!dev->ready_for_capture) {
+		int ret = allocate_buffers(dev);
+		if (ret < 0)
+			return ret;
+		dev->ready_for_capture = 1;
+	}
+	dprintkrw("v4l2_loopback_write() trying to write %zu bytes\n", count);
+	if (count > dev->buffer_size)
+		count = dev->buffer_size;
+
+	write_index = v4l2l_mod64(dev->write_position, dev->used_buffers);
+	b = &dev->buffers[write_index].buffer;
+
+	if (copy_from_user((void *)(dev->image + b->m.offset), (void *)buf,
+			   count)) {
+		printk(KERN_ERR
+		       "v4l2-loopback: failed copy_from_user() in write buf, could not write %zu\n",
+		       count);
+		return -EFAULT;
+	}
+	v4l2l_get_timestamp(b);
+	b->bytesused = count;
+	b->sequence = dev->write_position;
+	buffer_written(dev, &dev->buffers[write_index]);
+	wake_up_all(&dev->read_event);
+	dprintkrw("leave v4l2_loopback_write()\n");
+	return count;
+}
+
+/* init functions */
+/* frees buffers, if already allocated */
+static void free_buffers(struct v4l2_loopback_device *dev)
+{
+	MARK();
+	dprintk("freeing image@%p for dev:%p\n", dev ? dev->image : NULL, dev);
+	if (!dev)
+		return;
+	if (dev->image) {
+		vfree(dev->image);
+		dev->image = NULL;
+	}
+	if (dev->timeout_image) {
+		vfree(dev->timeout_image);
+		dev->timeout_image = NULL;
+	}
+	dev->imagesize = 0;
+}
+/* frees buffers, if they are no longer needed */
+static void try_free_buffers(struct v4l2_loopback_device *dev)
+{
+	MARK();
+	if (0 == dev->open_count.counter && !dev->keep_format) {
+		free_buffers(dev);
+		dev->ready_for_capture = 0;
+		dev->buffer_size = 0;
+		dev->write_position = 0;
+	}
+}
+/* allocates buffers, if buffer_size is set */
+static int allocate_buffers(struct v4l2_loopback_device *dev)
+{
+	int err;
+
+	MARK();
+	/* vfree on close file operation in case no open handles left */
+
+	if (dev->buffer_size < 1 || dev->buffers_number < 1)
+		return -EINVAL;
+
+	if ((__LONG_MAX__ / dev->buffer_size) < dev->buffers_number)
+		return -ENOSPC;
+
+	if (dev->image) {
+		dprintk("allocating buffers again: %ld %ld\n",
+			dev->buffer_size * dev->buffers_number, dev->imagesize);
+		/* FIXME: prevent double allocation more intelligently! */
+		if (dev->buffer_size * dev->buffers_number == dev->imagesize)
+			return 0;
+
+		/* check whether the total number of readers/writers is <=1 */
+		if ((dev->ready_for_capture + dev->active_readers) <= 1)
+			free_buffers(dev);
+		else
+			return -EINVAL;
+	}
+
+	dev->imagesize = (unsigned long)dev->buffer_size *
+			 (unsigned long)dev->buffers_number;
+
+	dprintk("allocating %ld = %ldx%d\n", dev->imagesize, dev->buffer_size,
+		dev->buffers_number);
+	err = -ENOMEM;
+
+	if (dev->timeout_jiffies > 0) {
+		err = allocate_timeout_image(dev);
+		if (err < 0)
+			goto error;
+	}
+
+	dev->image = vmalloc(dev->imagesize);
+	if (dev->image == NULL)
+		goto error;
+
+	dprintk("vmallocated %ld bytes\n", dev->imagesize);
+	MARK();
+
+	init_buffers(dev);
+	return 0;
+
+error:
+	free_buffers(dev);
+	return err;
+}
+
+/* init inner buffers, they are capture mode and flags are set as
+ * for capture mod buffers */
+static void init_buffers(struct v4l2_loopback_device *dev)
+{
+	int i;
+	int buffer_size;
+	int bytesused;
+	MARK();
+
+	buffer_size = dev->buffer_size;
+	bytesused = dev->pix_format.sizeimage;
+	for (i = 0; i < dev->buffers_number; ++i) {
+		struct v4l2_buffer *b = &dev->buffers[i].buffer;
+		b->index = i;
+		b->bytesused = bytesused;
+		b->length = buffer_size;
+		b->field = V4L2_FIELD_NONE;
+		b->flags = 0;
+		b->m.offset = i * buffer_size;
+		b->memory = V4L2_MEMORY_MMAP;
+		b->sequence = 0;
+		b->timestamp.tv_sec = 0;
+		b->timestamp.tv_usec = 0;
+		b->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+
+		v4l2l_get_timestamp(b);
+	}
+	dev->timeout_image_buffer = dev->buffers[0];
+	dev->timeout_image_buffer.buffer.m.offset = MAX_BUFFERS * buffer_size;
+	MARK();
+}
+
+static int allocate_timeout_image(struct v4l2_loopback_device *dev)
+{
+	MARK();
+	if (dev->buffer_size <= 0) {
+		dev->timeout_image_io = 0;
+		return -EINVAL;
+	}
+
+	if (dev->timeout_image == NULL) {
+		dev->timeout_image = vzalloc(dev->buffer_size);
+		if (dev->timeout_image == NULL) {
+			dev->timeout_image_io = 0;
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+/* fills and register video device */
+static void init_vdev(struct video_device *vdev, int nr)
+{
+	MARK();
+
+#ifdef V4L2LOOPBACK_WITH_STD
+	vdev->tvnorms = V4L2_STD_ALL;
+#endif /* V4L2LOOPBACK_WITH_STD */
+
+	vdev->vfl_type = VFL_TYPE_VIDEO;
+	vdev->fops = &v4l2_loopback_fops;
+	vdev->ioctl_ops = &v4l2_loopback_ioctl_ops;
+	vdev->release = &video_device_release;
+	vdev->minor = -1;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0)
+	vdev->device_caps = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_VIDEO_CAPTURE |
+			    V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_READWRITE |
+			    V4L2_CAP_STREAMING;
+#endif
+
+	if (debug > 1)
+		vdev->dev_debug = V4L2_DEV_DEBUG_IOCTL |
+				  V4L2_DEV_DEBUG_IOCTL_ARG;
+
+	vdev->vfl_dir = VFL_DIR_M2M;
+
+	MARK();
+}
+
+/* init default capture parameters, only fps may be changed in future */
+static void init_capture_param(struct v4l2_captureparm *capture_param)
+{
+	MARK();
+	capture_param->capability = 0;
+	capture_param->capturemode = 0;
+	capture_param->extendedmode = 0;
+	capture_param->readbuffers = max_buffers;
+	capture_param->timeperframe.numerator = 1;
+	capture_param->timeperframe.denominator = 30;
+}
+
+static void check_timers(struct v4l2_loopback_device *dev)
+{
+	if (!dev->ready_for_capture)
+		return;
+
+	if (dev->timeout_jiffies > 0 && !timer_pending(&dev->timeout_timer))
+		mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies);
+	if (dev->sustain_framerate && !timer_pending(&dev->sustain_timer))
+		mod_timer(&dev->sustain_timer,
+			  jiffies + dev->frame_jiffies * 3 / 2);
+}
+#ifdef HAVE_TIMER_SETUP
+static void sustain_timer_clb(struct timer_list *t)
+{
+	struct v4l2_loopback_device *dev = from_timer(dev, t, sustain_timer);
+#else
+static void sustain_timer_clb(unsigned long nr)
+{
+	struct v4l2_loopback_device *dev =
+		idr_find(&v4l2loopback_index_idr, nr);
+#endif
+	spin_lock(&dev->lock);
+	if (dev->sustain_framerate) {
+		dev->reread_count++;
+		dprintkrw("reread: %lld %d\n", (long long)dev->write_position,
+			  dev->reread_count);
+		if (dev->reread_count == 1)
+			mod_timer(&dev->sustain_timer,
+				  jiffies + max(1UL, dev->frame_jiffies / 2));
+		else
+			mod_timer(&dev->sustain_timer,
+				  jiffies + dev->frame_jiffies);
+		wake_up_all(&dev->read_event);
+	}
+	spin_unlock(&dev->lock);
+}
+#ifdef HAVE_TIMER_SETUP
+static void timeout_timer_clb(struct timer_list *t)
+{
+	struct v4l2_loopback_device *dev = from_timer(dev, t, timeout_timer);
+#else
+static void timeout_timer_clb(unsigned long nr)
+{
+	struct v4l2_loopback_device *dev =
+		idr_find(&v4l2loopback_index_idr, nr);
+#endif
+	spin_lock(&dev->lock);
+	if (dev->timeout_jiffies > 0) {
+		dev->timeout_happened = 1;
+		mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies);
+		wake_up_all(&dev->read_event);
+	}
+	spin_unlock(&dev->lock);
+}
+
+/* init loopback main structure */
+#define DEFAULT_FROM_CONF(confmember, default_condition, default_value)        \
+	((conf) ?                                                              \
+		 ((conf->confmember default_condition) ? (default_value) :     \
+							 (conf->confmember)) : \
+		 default_value)
+
+static int v4l2_loopback_add(struct v4l2_loopback_config *conf, int *ret_nr)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_ctrl_handler *hdl;
+	struct v4l2loopback_private *vdev_priv = NULL;
+
+	int err = -ENOMEM;
+
+	u32 _width = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH;
+	u32 _height = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT;
+
+	u32 _min_width = DEFAULT_FROM_CONF(min_width,
+					   < V4L2LOOPBACK_SIZE_MIN_WIDTH,
+					   V4L2LOOPBACK_SIZE_MIN_WIDTH);
+	u32 _min_height = DEFAULT_FROM_CONF(min_height,
+					    < V4L2LOOPBACK_SIZE_MIN_HEIGHT,
+					    V4L2LOOPBACK_SIZE_MIN_HEIGHT);
+	u32 _max_width = DEFAULT_FROM_CONF(max_width, < _min_width, max_width);
+	u32 _max_height =
+		DEFAULT_FROM_CONF(max_height, < _min_height, max_height);
+	bool _announce_all_caps = (conf && conf->announce_all_caps >= 0) ?
+					  (conf->announce_all_caps) :
+					  V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS;
+	int _max_buffers = DEFAULT_FROM_CONF(max_buffers, <= 0, max_buffers);
+	int _max_openers = DEFAULT_FROM_CONF(max_openers, <= 0, max_openers);
+
+	int nr = -1;
+
+	_announce_all_caps = (!!_announce_all_caps);
+
+	if (conf) {
+		const int output_nr = conf->output_nr;
+#ifdef SPLIT_DEVICES
+		const int capture_nr = conf->capture_nr;
+#else
+		const int capture_nr = output_nr;
+#endif
+		if (capture_nr >= 0 && output_nr == capture_nr) {
+			nr = output_nr;
+		} else if (capture_nr < 0 && output_nr < 0) {
+			nr = -1;
+		} else if (capture_nr < 0) {
+			nr = output_nr;
+		} else if (output_nr < 0) {
+			nr = capture_nr;
+		} else {
+			printk(KERN_ERR
+			       "split OUTPUT and CAPTURE devices not yet supported.");
+			printk(KERN_INFO
+			       "both devices must have the same number (%d != %d).",
+			       output_nr, capture_nr);
+			return -EINVAL;
+		}
+	}
+
+	if (idr_find(&v4l2loopback_index_idr, nr))
+		return -EEXIST;
+
+	dprintk("creating v4l2loopback-device #%d\n", nr);
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	/* allocate id, if @id >= 0, we're requesting that specific id */
+	if (nr >= 0) {
+		err = idr_alloc(&v4l2loopback_index_idr, dev, nr, nr + 1,
+				GFP_KERNEL);
+		if (err == -ENOSPC)
+			err = -EEXIST;
+	} else {
+		err = idr_alloc(&v4l2loopback_index_idr, dev, 0, 0, GFP_KERNEL);
+	}
+	if (err < 0)
+		goto out_free_dev;
+	nr = err;
+	err = -ENOMEM;
+
+	if (conf && conf->card_label[0]) {
+		snprintf(dev->card_label, sizeof(dev->card_label), "%s",
+			 conf->card_label);
+	} else {
+		snprintf(dev->card_label, sizeof(dev->card_label),
+			 "Dummy video device (0x%04X)", nr);
+	}
+	snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name),
+		 "v4l2loopback-%03d", nr);
+
+	err = v4l2_device_register(NULL, &dev->v4l2_dev);
+	if (err)
+		goto out_free_idr;
+	MARK();
+
+	dev->vdev = video_device_alloc();
+	if (dev->vdev == NULL) {
+		err = -ENOMEM;
+		goto out_unregister;
+	}
+
+	vdev_priv = kzalloc(sizeof(struct v4l2loopback_private), GFP_KERNEL);
+	if (vdev_priv == NULL) {
+		err = -ENOMEM;
+		goto out_unregister;
+	}
+
+	video_set_drvdata(dev->vdev, vdev_priv);
+	if (video_get_drvdata(dev->vdev) == NULL) {
+		err = -ENOMEM;
+		goto out_unregister;
+	}
+
+	MARK();
+	snprintf(dev->vdev->name, sizeof(dev->vdev->name), "%s",
+		 dev->card_label);
+
+	vdev_priv->device_nr = nr;
+
+	init_vdev(dev->vdev, nr);
+	dev->vdev->v4l2_dev = &dev->v4l2_dev;
+	init_capture_param(&dev->capture_param);
+	err = set_timeperframe(dev, &dev->capture_param.timeperframe);
+	if (err)
+		goto out_unregister;
+	dev->keep_format = 0;
+	dev->sustain_framerate = 0;
+
+	dev->announce_all_caps = _announce_all_caps;
+	dev->min_width = _min_width;
+	dev->min_height = _min_height;
+	dev->max_width = _max_width;
+	dev->max_height = _max_height;
+	dev->max_openers = _max_openers;
+	dev->buffers_number = dev->used_buffers = _max_buffers;
+
+	dev->write_position = 0;
+
+	MARK();
+	spin_lock_init(&dev->lock);
+	spin_lock_init(&dev->list_lock);
+	INIT_LIST_HEAD(&dev->outbufs_list);
+	if (list_empty(&dev->outbufs_list)) {
+		int i;
+
+		for (i = 0; i < dev->used_buffers; ++i)
+			list_add_tail(&dev->buffers[i].list_head,
+				      &dev->outbufs_list);
+	}
+	memset(dev->bufpos2index, 0, sizeof(dev->bufpos2index));
+	atomic_set(&dev->open_count, 0);
+	dev->ready_for_capture = 0;
+	dev->ready_for_output = 1;
+
+	dev->buffer_size = 0;
+	dev->image = NULL;
+	dev->imagesize = 0;
+#ifdef HAVE_TIMER_SETUP
+	timer_setup(&dev->sustain_timer, sustain_timer_clb, 0);
+	timer_setup(&dev->timeout_timer, timeout_timer_clb, 0);
+#else
+	setup_timer(&dev->sustain_timer, sustain_timer_clb, nr);
+	setup_timer(&dev->timeout_timer, timeout_timer_clb, nr);
+#endif
+	dev->reread_count = 0;
+	dev->timeout_jiffies = 0;
+	dev->timeout_image = NULL;
+	dev->timeout_happened = 0;
+
+	hdl = &dev->ctrl_handler;
+	err = v4l2_ctrl_handler_init(hdl, 4);
+	if (err)
+		goto out_unregister;
+	v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_keepformat, NULL);
+	v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_sustainframerate, NULL);
+	v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeout, NULL);
+	v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeoutimageio, NULL);
+	if (hdl->error) {
+		err = hdl->error;
+		goto out_free_handler;
+	}
+	dev->v4l2_dev.ctrl_handler = hdl;
+
+	err = v4l2_ctrl_handler_setup(hdl);
+	if (err)
+		goto out_free_handler;
+
+	/* FIXME set buffers to 0 */
+
+	/* Set initial format */
+	if (_width < _min_width)
+		_width = _min_width;
+	if (_width > _max_width)
+		_width = _max_width;
+	if (_height < _min_height)
+		_height = _min_height;
+	if (_height > _max_height)
+		_height = _max_height;
+
+	dev->pix_format.width = _width;
+	dev->pix_format.height = _height;
+	dev->pix_format.pixelformat = formats[0].fourcc;
+	dev->pix_format.colorspace =
+		V4L2_COLORSPACE_DEFAULT; /* do we need to set this ? */
+	dev->pix_format.field = V4L2_FIELD_NONE;
+
+	dev->buffer_size = PAGE_ALIGN(dev->pix_format.sizeimage);
+	dprintk("buffer_size = %ld (=%d)\n", dev->buffer_size,
+		dev->pix_format.sizeimage);
+
+	if (dev->buffer_size && ((err = allocate_buffers(dev)) < 0))
+		goto out_free_handler;
+
+	init_waitqueue_head(&dev->read_event);
+
+	/* register the device -> it creates /dev/video* */
+	if (video_register_device(dev->vdev, VFL_TYPE_VIDEO, nr) < 0) {
+		printk(KERN_ERR
+		       "v4l2loopback: failed video_register_device()\n");
+		err = -EFAULT;
+		goto out_free_device;
+	}
+	v4l2loopback_create_sysfs(dev->vdev);
+
+	MARK();
+	if (ret_nr)
+		*ret_nr = dev->vdev->num;
+	return 0;
+
+out_free_device:
+	video_device_release(dev->vdev);
+out_free_handler:
+	v4l2_ctrl_handler_free(&dev->ctrl_handler);
+out_unregister:
+	video_set_drvdata(dev->vdev, NULL);
+	if (vdev_priv != NULL)
+		kfree(vdev_priv);
+	v4l2_device_unregister(&dev->v4l2_dev);
+out_free_idr:
+	idr_remove(&v4l2loopback_index_idr, nr);
+out_free_dev:
+	kfree(dev);
+	return err;
+}
+
+static void v4l2_loopback_remove(struct v4l2_loopback_device *dev)
+{
+	free_buffers(dev);
+	v4l2loopback_remove_sysfs(dev->vdev);
+	kfree(video_get_drvdata(dev->vdev));
+	video_unregister_device(dev->vdev);
+	v4l2_device_unregister(&dev->v4l2_dev);
+	v4l2_ctrl_handler_free(&dev->ctrl_handler);
+	kfree(dev);
+}
+
+static long v4l2loopback_control_ioctl(struct file *file, unsigned int cmd,
+				       unsigned long parm)
+{
+	struct v4l2_loopback_device *dev;
+	struct v4l2_loopback_config conf;
+	struct v4l2_loopback_config *confptr = &conf;
+	int device_nr, capture_nr, output_nr;
+	int ret;
+
+	ret = mutex_lock_killable(&v4l2loopback_ctl_mutex);
+	if (ret)
+		return ret;
+
+	ret = -EINVAL;
+	switch (cmd) {
+	default:
+		ret = -ENOSYS;
+		break;
+		/* add a v4l2loopback device (pair), based on the user-provided specs */
+	case V4L2LOOPBACK_CTL_ADD:
+		if (parm) {
+			if ((ret = copy_from_user(&conf, (void *)parm,
+						  sizeof(conf))) < 0)
+				break;
+		} else
+			confptr = NULL;
+		ret = v4l2_loopback_add(confptr, &device_nr);
+		if (ret >= 0)
+			ret = device_nr;
+		break;
+		/* remove a v4l2loopback device (both capture and output) */
+	case V4L2LOOPBACK_CTL_REMOVE:
+		ret = v4l2loopback_lookup((int)parm, &dev);
+		if (ret >= 0 && dev) {
+			int nr = ret;
+			ret = -EBUSY;
+			if (dev->open_count.counter > 0)
+				break;
+			idr_remove(&v4l2loopback_index_idr, nr);
+			v4l2_loopback_remove(dev);
+			ret = 0;
+		};
+		break;
+		/* get information for a loopback device.
+                 * this is mostly about limits (which cannot be queried directly with  VIDIOC_G_FMT and friends
+                 */
+	case V4L2LOOPBACK_CTL_QUERY:
+		if (!parm)
+			break;
+		if ((ret = copy_from_user(&conf, (void *)parm, sizeof(conf))) <
+		    0)
+			break;
+		capture_nr = output_nr = conf.output_nr;
+#ifdef SPLIT_DEVICES
+		capture_nr = conf.capture_nr;
+#endif
+		device_nr = (output_nr < 0) ? capture_nr : output_nr;
+		MARK();
+		/* get the device from either capture_nr or output_nr (whatever is valid) */
+		if ((ret = v4l2loopback_lookup(device_nr, &dev)) < 0)
+			break;
+		MARK();
+		/* if we got the device from output_nr and there is a valid capture_nr,
+                 * make sure that both refer to the same device (or bail out)
+                 */
+		if ((device_nr != capture_nr) && (capture_nr >= 0) &&
+		    ((ret = v4l2loopback_lookup(capture_nr, 0)) < 0))
+			break;
+		MARK();
+		/* if otoh, we got the device from capture_nr and there is a valid output_nr,
+                 * make sure that both refer to the same device (or bail out)
+                 */
+		if ((device_nr != output_nr) && (output_nr >= 0) &&
+		    ((ret = v4l2loopback_lookup(output_nr, 0)) < 0))
+			break;
+		MARK();
+
+		/* v4l2_loopback_config identified a single device, so fetch the data */
+		snprintf(conf.card_label, sizeof(conf.card_label), "%s",
+			 dev->card_label);
+		MARK();
+		conf.output_nr = dev->vdev->num;
+#ifdef SPLIT_DEVICES
+		conf.capture_nr = dev->vdev->num;
+#endif
+		conf.min_width = dev->min_width;
+		conf.min_height = dev->min_height;
+		conf.max_width = dev->max_width;
+		conf.max_height = dev->max_height;
+		conf.announce_all_caps = dev->announce_all_caps;
+		conf.max_buffers = dev->buffers_number;
+		conf.max_openers = dev->max_openers;
+		conf.debug = debug;
+		MARK();
+		if (copy_to_user((void *)parm, &conf, sizeof(conf))) {
+			ret = -EFAULT;
+			break;
+		}
+		MARK();
+		ret = 0;
+		;
+		break;
+	}
+
+	MARK();
+	mutex_unlock(&v4l2loopback_ctl_mutex);
+	MARK();
+	return ret;
+}
+
+/* LINUX KERNEL */
+
+static const struct file_operations v4l2loopback_ctl_fops = {
+	// clang-format off
+	.owner		= THIS_MODULE,
+	.open		= nonseekable_open,
+	.unlocked_ioctl	= v4l2loopback_control_ioctl,
+	.compat_ioctl	= v4l2loopback_control_ioctl,
+	.llseek		= noop_llseek,
+	// clang-format on
+};
+
+static struct miscdevice v4l2loopback_misc = {
+	// clang-format off
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "v4l2loopback",
+	.fops		= &v4l2loopback_ctl_fops,
+	// clang-format on
+};
+
+static const struct v4l2_file_operations v4l2_loopback_fops = {
+	// clang-format off
+	.owner		= THIS_MODULE,
+	.open		= v4l2_loopback_open,
+	.release	= v4l2_loopback_close,
+	.read		= v4l2_loopback_read,
+	.write		= v4l2_loopback_write,
+	.poll		= v4l2_loopback_poll,
+	.mmap		= v4l2_loopback_mmap,
+	.unlocked_ioctl	= video_ioctl2,
+	// clang-format on
+};
+
+static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops = {
+	// clang-format off
+	.vidioc_querycap		= &vidioc_querycap,
+	.vidioc_enum_framesizes		= &vidioc_enum_framesizes,
+	.vidioc_enum_frameintervals	= &vidioc_enum_frameintervals,
+
+	.vidioc_enum_output		= &vidioc_enum_output,
+	.vidioc_g_output		= &vidioc_g_output,
+	.vidioc_s_output		= &vidioc_s_output,
+
+	.vidioc_enum_input		= &vidioc_enum_input,
+	.vidioc_g_input			= &vidioc_g_input,
+	.vidioc_s_input			= &vidioc_s_input,
+
+	.vidioc_enum_fmt_vid_cap	= &vidioc_enum_fmt_cap,
+	.vidioc_g_fmt_vid_cap		= &vidioc_g_fmt_cap,
+	.vidioc_s_fmt_vid_cap		= &vidioc_s_fmt_cap,
+	.vidioc_try_fmt_vid_cap		= &vidioc_try_fmt_cap,
+
+	.vidioc_enum_fmt_vid_out	= &vidioc_enum_fmt_out,
+	.vidioc_s_fmt_vid_out		= &vidioc_s_fmt_out,
+	.vidioc_g_fmt_vid_out		= &vidioc_g_fmt_out,
+	.vidioc_try_fmt_vid_out		= &vidioc_try_fmt_out,
+
+#ifdef V4L2L_OVERLAY
+	.vidioc_s_fmt_vid_overlay	= &vidioc_s_fmt_overlay,
+	.vidioc_g_fmt_vid_overlay	= &vidioc_g_fmt_overlay,
+#endif
+
+#ifdef V4L2LOOPBACK_WITH_STD
+	.vidioc_s_std			= &vidioc_s_std,
+	.vidioc_g_std			= &vidioc_g_std,
+	.vidioc_querystd		= &vidioc_querystd,
+#endif /* V4L2LOOPBACK_WITH_STD */
+
+	.vidioc_g_parm			= &vidioc_g_parm,
+	.vidioc_s_parm			= &vidioc_s_parm,
+
+	.vidioc_reqbufs			= &vidioc_reqbufs,
+	.vidioc_querybuf		= &vidioc_querybuf,
+	.vidioc_qbuf			= &vidioc_qbuf,
+	.vidioc_dqbuf			= &vidioc_dqbuf,
+
+	.vidioc_streamon		= &vidioc_streamon,
+	.vidioc_streamoff		= &vidioc_streamoff,
+
+#ifdef CONFIG_VIDEO_V4L1_COMPAT
+	.vidiocgmbuf			= &vidiocgmbuf,
+#endif
+
+	.vidioc_subscribe_event		= &vidioc_subscribe_event,
+	.vidioc_unsubscribe_event	= &v4l2_event_unsubscribe,
+	// clang-format on
+};
+
+static int free_device_cb(int id, void *ptr, void *data)
+{
+	struct v4l2_loopback_device *dev = ptr;
+	v4l2_loopback_remove(dev);
+	return 0;
+}
+static void free_devices(void)
+{
+	idr_for_each(&v4l2loopback_index_idr, &free_device_cb, NULL);
+	idr_destroy(&v4l2loopback_index_idr);
+}
+
+static int __init v4l2loopback_init_module(void)
+{
+	const u32 min_width = V4L2LOOPBACK_SIZE_MIN_WIDTH;
+	const u32 min_height = V4L2LOOPBACK_SIZE_MIN_HEIGHT;
+	int err;
+	int i;
+	MARK();
+
+	err = misc_register(&v4l2loopback_misc);
+	if (err < 0)
+		return err;
+
+	if (devices < 0) {
+		devices = 1;
+
+		/* try guessing the devices from the "video_nr" parameter */
+		for (i = MAX_DEVICES - 1; i >= 0; i--) {
+			if (video_nr[i] >= 0) {
+				devices = i + 1;
+				break;
+			}
+		}
+	}
+
+	if (devices > MAX_DEVICES) {
+		devices = MAX_DEVICES;
+		printk(KERN_INFO
+		       "v4l2loopback: number of initial devices is limited to: %d\n",
+		       MAX_DEVICES);
+	}
+
+	if (max_buffers > MAX_BUFFERS) {
+		max_buffers = MAX_BUFFERS;
+		printk(KERN_INFO
+		       "v4l2loopback: number of buffers is limited to: %d\n",
+		       MAX_BUFFERS);
+	}
+
+	if (max_openers < 0) {
+		printk(KERN_INFO
+		       "v4l2loopback: allowing %d openers rather than %d\n",
+		       2, max_openers);
+		max_openers = 2;
+	}
+
+	if (max_width < min_width) {
+		max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH;
+		printk(KERN_INFO "v4l2loopback: using max_width %d\n",
+		       max_width);
+	}
+	if (max_height < min_height) {
+		max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT;
+		printk(KERN_INFO "v4l2loopback: using max_height %d\n",
+		       max_height);
+	}
+
+	for (i = 0; i < devices; i++) {
+		struct v4l2_loopback_config cfg = {
+			// clang-format off
+			.output_nr		= video_nr[i],
+#ifdef SPLIT_DEVICES
+			.capture_nr		= video_nr[i],
+#endif
+			.min_width		= min_width,
+			.min_height		= min_height,
+			.max_width		= max_width,
+			.max_height		= max_height,
+			.announce_all_caps	= (!exclusive_caps[i]),
+			.max_buffers		= max_buffers,
+			.max_openers		= max_openers,
+			.debug			= debug,
+			// clang-format on
+		};
+		cfg.card_label[0] = 0;
+		if (card_label[i])
+			snprintf(cfg.card_label, sizeof(cfg.card_label), "%s",
+				 card_label[i]);
+		err = v4l2_loopback_add(&cfg, 0);
+		if (err) {
+			free_devices();
+			goto error;
+		}
+	}
+
+	dprintk("module installed\n");
+
+	printk(KERN_INFO "v4l2loopback driver version %d.%d.%d%s loaded\n",
+	       // clang-format off
+	       (V4L2LOOPBACK_VERSION_CODE >> 16) & 0xff,
+	       (V4L2LOOPBACK_VERSION_CODE >>  8) & 0xff,
+	       (V4L2LOOPBACK_VERSION_CODE      ) & 0xff,
+#ifdef SNAPSHOT_VERSION
+	       " (" __stringify(SNAPSHOT_VERSION) ")"
+#else
+	       ""
+#endif
+	       );
+	// clang-format on
+
+	return 0;
+error:
+	misc_deregister(&v4l2loopback_misc);
+	return err;
+}
+
+static void v4l2loopback_cleanup_module(void)
+{
+	MARK();
+	/* unregister the device -> it deletes /dev/video* */
+	free_devices();
+	/* and get rid of /dev/v4l2loopback */
+	misc_deregister(&v4l2loopback_misc);
+	dprintk("module removed\n");
+}
+
+MODULE_ALIAS_MISCDEV(MISC_DYNAMIC_MINOR);
+
+module_init(v4l2loopback_init_module);
+module_exit(v4l2loopback_cleanup_module);
diff --git a/drivers/media/v4l2-core/v4l2loopback.h b/drivers/media/v4l2-core/v4l2loopback.h
new file mode 100644
index 000000000000..1bc7e6b747a4
--- /dev/null
+++ b/drivers/media/v4l2-core/v4l2loopback.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * v4l2loopback.h
+ *
+ * Written by IOhannes m zmölnig, 7/1/20.
+ *
+ * Copyright 2020 by IOhannes m zmölnig.  Redistribution of this file is
+ * permitted under the GNU General Public License.
+ */
+#ifndef _V4L2LOOPBACK_H
+#define _V4L2LOOPBACK_H
+
+#define V4L2LOOPBACK_VERSION_MAJOR 0
+#define V4L2LOOPBACK_VERSION_MINOR 13
+#define V4L2LOOPBACK_VERSION_BUGFIX 1
+
+/* /dev/v4l2loopback interface */
+
+struct v4l2_loopback_config {
+	/**
+         * the device-number (/dev/video<nr>)
+         * V4L2LOOPBACK_CTL_ADD:
+         * setting this to a value<0, will allocate an available one
+         * if nr>=0 and the device already exists, the ioctl will EEXIST
+         * if output_nr and capture_nr are the same, only a single device will be created
+	 * NOTE: currently split-devices (where output_nr and capture_nr differ)
+	 *   are not implemented yet.
+	 *   until then, requesting different device-IDs will result in EINVAL.
+         *
+         * V4L2LOOPBACK_CTL_QUERY:
+         * either both output_nr and capture_nr must refer to the same loopback,
+         * or one (and only one) of them must be -1
+         *
+         */
+	int output_nr;
+	int unused; /*capture_nr;*/
+
+	/**
+         * a nice name for your device
+         * if (*card_label)==0, an automatic name is assigned
+         */
+	char card_label[32];
+
+	/**
+         * allowed frame size
+         * if too low, default values are used
+         */
+	unsigned int min_width;
+	unsigned int max_width;
+	unsigned int min_height;
+	unsigned int max_height;
+
+	/**
+         * number of buffers to allocate for the queue
+         * if set to <=0, default values are used
+         */
+	int max_buffers;
+
+	/**
+         * how many consumers are allowed to open this device concurrently
+         * if set to <=0, default values are used
+         */
+	int max_openers;
+
+	/**
+         * set the debugging level for this device
+         */
+	int debug;
+
+	/**
+         * whether to announce OUTPUT/CAPTURE capabilities exclusively
+         * for this device or not
+         * (!exclusive_caps)
+	 * NOTE: this is going to be removed once separate output/capture
+	 *       devices are implemented
+         */
+	int announce_all_caps;
+};
+
+/* a pointer to a (struct v4l2_loopback_config) that has all values you wish to impose on the
+ * to-be-created device set.
+ * if the ptr is NULL, a new device is created with default values at the driver's discretion.
+ *
+ * returns the device_nr of the OUTPUT device (which can be used with V4L2LOOPBACK_CTL_QUERY,
+ * to get more information on the device)
+ */
+#define V4L2LOOPBACK_CTL_ADD 0x4C80
+
+/* a pointer to a (struct v4l2_loopback_config) that has output_nr and/or capture_nr set
+ * (the two values must either refer to video-devices associated with the same loopback device
+ *  or exactly one of them must be <0
+ */
+#define V4L2LOOPBACK_CTL_QUERY 0x4C82
+
+/* the device-number (either CAPTURE or OUTPUT) associated with the loopback-device */
+#define V4L2LOOPBACK_CTL_REMOVE 0x4C81
+
+#endif /* _V4L2LOOPBACK_H */
diff --git a/drivers/media/v4l2-core/v4l2loopback_formats.h b/drivers/media/v4l2-core/v4l2loopback_formats.h
new file mode 100644
index 000000000000..d855a3796554
--- /dev/null
+++ b/drivers/media/v4l2-core/v4l2loopback_formats.h
@@ -0,0 +1,445 @@
+static const struct v4l2l_format formats[] = {
+#ifndef V4L2_PIX_FMT_VP9
+#define V4L2_PIX_FMT_VP9 v4l2_fourcc('V', 'P', '9', '0')
+#endif
+#ifndef V4L2_PIX_FMT_HEVC
+#define V4L2_PIX_FMT_HEVC v4l2_fourcc('H', 'E', 'V', 'C')
+#endif
+
+	/* here come the packed formats */
+	{
+		.name = "32 bpp RGB, le",
+		.fourcc = V4L2_PIX_FMT_BGR32,
+		.depth = 32,
+		.flags = 0,
+	},
+	{
+		.name = "32 bpp RGB, be",
+		.fourcc = V4L2_PIX_FMT_RGB32,
+		.depth = 32,
+		.flags = 0,
+	},
+	{
+		.name = "24 bpp RGB, le",
+		.fourcc = V4L2_PIX_FMT_BGR24,
+		.depth = 24,
+		.flags = 0,
+	},
+	{
+		.name = "24 bpp RGB, be",
+		.fourcc = V4L2_PIX_FMT_RGB24,
+		.depth = 24,
+		.flags = 0,
+	},
+#ifdef V4L2_PIX_FMT_ABGR32
+	{
+		.name = "32 bpp RGBA, le",
+		.fourcc = V4L2_PIX_FMT_ABGR32,
+		.depth = 32,
+		.flags = 0,
+	},
+#endif
+#ifdef V4L2_PIX_FMT_RGBA32
+	{
+		.name = "32 bpp RGBA",
+		.fourcc = V4L2_PIX_FMT_RGBA32,
+		.depth = 32,
+		.flags = 0,
+	},
+#endif
+#ifdef V4L2_PIX_FMT_RGB332
+	{
+		.name = "8 bpp RGB-3-3-2",
+		.fourcc = V4L2_PIX_FMT_RGB332,
+		.depth = 8,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_RGB332 */
+#ifdef V4L2_PIX_FMT_RGB444
+	{
+		.name = "16 bpp RGB (xxxxrrrr ggggbbbb)",
+		.fourcc = V4L2_PIX_FMT_RGB444,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_RGB444 */
+#ifdef V4L2_PIX_FMT_RGB555
+	{
+		.name = "16 bpp RGB-5-5-5",
+		.fourcc = V4L2_PIX_FMT_RGB555,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_RGB555 */
+#ifdef V4L2_PIX_FMT_RGB565
+	{
+		.name = "16 bpp RGB-5-6-5",
+		.fourcc = V4L2_PIX_FMT_RGB565,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_RGB565 */
+#ifdef V4L2_PIX_FMT_RGB555X
+	{
+		.name = "16 bpp RGB-5-5-5 BE",
+		.fourcc = V4L2_PIX_FMT_RGB555X,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_RGB555X */
+#ifdef V4L2_PIX_FMT_RGB565X
+	{
+		.name = "16 bpp RGB-5-6-5 BE",
+		.fourcc = V4L2_PIX_FMT_RGB565X,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_RGB565X */
+#ifdef V4L2_PIX_FMT_BGR666
+	{
+		.name = "18 bpp BGR-6-6-6",
+		.fourcc = V4L2_PIX_FMT_BGR666,
+		.depth = 18,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_BGR666 */
+	{
+		.name = "4:2:2, packed, YUYV",
+		.fourcc = V4L2_PIX_FMT_YUYV,
+		.depth = 16,
+		.flags = 0,
+	},
+	{
+		.name = "4:2:2, packed, UYVY",
+		.fourcc = V4L2_PIX_FMT_UYVY,
+		.depth = 16,
+		.flags = 0,
+	},
+#ifdef V4L2_PIX_FMT_YVYU
+	{
+		.name = "4:2:2, packed YVYU",
+		.fourcc = V4L2_PIX_FMT_YVYU,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif
+#ifdef V4L2_PIX_FMT_VYUY
+	{
+		.name = "4:2:2, packed VYUY",
+		.fourcc = V4L2_PIX_FMT_VYUY,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif
+	{
+		.name = "4:2:2, packed YYUV",
+		.fourcc = V4L2_PIX_FMT_YYUV,
+		.depth = 16,
+		.flags = 0,
+	},
+	{
+		.name = "YUV-8-8-8-8",
+		.fourcc = V4L2_PIX_FMT_YUV32,
+		.depth = 32,
+		.flags = 0,
+	},
+	{
+		.name = "8 bpp, Greyscale",
+		.fourcc = V4L2_PIX_FMT_GREY,
+		.depth = 8,
+		.flags = 0,
+	},
+#ifdef V4L2_PIX_FMT_Y4
+	{
+		.name = "4 bpp Greyscale",
+		.fourcc = V4L2_PIX_FMT_Y4,
+		.depth = 4,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_Y4 */
+#ifdef V4L2_PIX_FMT_Y6
+	{
+		.name = "6 bpp Greyscale",
+		.fourcc = V4L2_PIX_FMT_Y6,
+		.depth = 6,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_Y6 */
+#ifdef V4L2_PIX_FMT_Y10
+	{
+		.name = "10 bpp Greyscale",
+		.fourcc = V4L2_PIX_FMT_Y10,
+		.depth = 10,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_Y10 */
+#ifdef V4L2_PIX_FMT_Y12
+	{
+		.name = "12 bpp Greyscale",
+		.fourcc = V4L2_PIX_FMT_Y12,
+		.depth = 12,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_Y12 */
+	{
+		.name = "16 bpp, Greyscale",
+		.fourcc = V4L2_PIX_FMT_Y16,
+		.depth = 16,
+		.flags = 0,
+	},
+#ifdef V4L2_PIX_FMT_YUV444
+	{
+		.name = "16 bpp xxxxyyyy uuuuvvvv",
+		.fourcc = V4L2_PIX_FMT_YUV444,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_YUV444 */
+#ifdef V4L2_PIX_FMT_YUV555
+	{
+		.name = "16 bpp YUV-5-5-5",
+		.fourcc = V4L2_PIX_FMT_YUV555,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_YUV555 */
+#ifdef V4L2_PIX_FMT_YUV565
+	{
+		.name = "16 bpp YUV-5-6-5",
+		.fourcc = V4L2_PIX_FMT_YUV565,
+		.depth = 16,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_YUV565 */
+
+/* bayer formats */
+#ifdef V4L2_PIX_FMT_SRGGB8
+	{
+		.name = "Bayer RGGB 8bit",
+		.fourcc = V4L2_PIX_FMT_SRGGB8,
+		.depth = 8,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_SRGGB8 */
+#ifdef V4L2_PIX_FMT_SGRBG8
+	{
+		.name = "Bayer GRBG 8bit",
+		.fourcc = V4L2_PIX_FMT_SGRBG8,
+		.depth = 8,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_SGRBG8 */
+#ifdef V4L2_PIX_FMT_SGBRG8
+	{
+		.name = "Bayer GBRG 8bit",
+		.fourcc = V4L2_PIX_FMT_SGBRG8,
+		.depth = 8,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_SGBRG8 */
+#ifdef V4L2_PIX_FMT_SBGGR8
+	{
+		.name = "Bayer BA81 8bit",
+		.fourcc = V4L2_PIX_FMT_SBGGR8,
+		.depth = 8,
+		.flags = 0,
+	},
+#endif /* V4L2_PIX_FMT_SBGGR8 */
+
+	/* here come the planar formats */
+	{
+		.name = "4:1:0, planar, Y-Cr-Cb",
+		.fourcc = V4L2_PIX_FMT_YVU410,
+		.depth = 9,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+	{
+		.name = "4:2:0, planar, Y-Cr-Cb",
+		.fourcc = V4L2_PIX_FMT_YVU420,
+		.depth = 12,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+	{
+		.name = "4:1:0, planar, Y-Cb-Cr",
+		.fourcc = V4L2_PIX_FMT_YUV410,
+		.depth = 9,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+	{
+		.name = "4:2:0, planar, Y-Cb-Cr",
+		.fourcc = V4L2_PIX_FMT_YUV420,
+		.depth = 12,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+#ifdef V4L2_PIX_FMT_YUV422P
+	{
+		.name = "16 bpp YVU422 planar",
+		.fourcc = V4L2_PIX_FMT_YUV422P,
+		.depth = 16,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+#endif /* V4L2_PIX_FMT_YUV422P */
+#ifdef V4L2_PIX_FMT_YUV411P
+	{
+		.name = "16 bpp YVU411 planar",
+		.fourcc = V4L2_PIX_FMT_YUV411P,
+		.depth = 16,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+#endif /* V4L2_PIX_FMT_YUV411P */
+#ifdef V4L2_PIX_FMT_Y41P
+	{
+		.name = "12 bpp YUV 4:1:1",
+		.fourcc = V4L2_PIX_FMT_Y41P,
+		.depth = 12,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+#endif /* V4L2_PIX_FMT_Y41P */
+#ifdef V4L2_PIX_FMT_NV12
+	{
+		.name = "12 bpp Y/CbCr 4:2:0 ",
+		.fourcc = V4L2_PIX_FMT_NV12,
+		.depth = 12,
+		.flags = FORMAT_FLAGS_PLANAR,
+	},
+#endif /* V4L2_PIX_FMT_NV12 */
+
+/* here come the compressed formats */
+
+#ifdef V4L2_PIX_FMT_MJPEG
+	{
+		.name = "Motion-JPEG",
+		.fourcc = V4L2_PIX_FMT_MJPEG,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_MJPEG */
+#ifdef V4L2_PIX_FMT_JPEG
+	{
+		.name = "JFIF JPEG",
+		.fourcc = V4L2_PIX_FMT_JPEG,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_JPEG */
+#ifdef V4L2_PIX_FMT_DV
+	{
+		.name = "DV1394",
+		.fourcc = V4L2_PIX_FMT_DV,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_DV */
+#ifdef V4L2_PIX_FMT_MPEG
+	{
+		.name = "MPEG-1/2/4 Multiplexed",
+		.fourcc = V4L2_PIX_FMT_MPEG,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_MPEG */
+#ifdef V4L2_PIX_FMT_H264
+	{
+		.name = "H264 with start codes",
+		.fourcc = V4L2_PIX_FMT_H264,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_H264 */
+#ifdef V4L2_PIX_FMT_H264_NO_SC
+	{
+		.name = "H264 without start codes",
+		.fourcc = V4L2_PIX_FMT_H264_NO_SC,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_H264_NO_SC */
+#ifdef V4L2_PIX_FMT_H264_MVC
+	{
+		.name = "H264 MVC",
+		.fourcc = V4L2_PIX_FMT_H264_MVC,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_H264_MVC */
+#ifdef V4L2_PIX_FMT_H263
+	{
+		.name = "H263",
+		.fourcc = V4L2_PIX_FMT_H263,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_H263 */
+#ifdef V4L2_PIX_FMT_MPEG1
+	{
+		.name = "MPEG-1 ES",
+		.fourcc = V4L2_PIX_FMT_MPEG1,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_MPEG1 */
+#ifdef V4L2_PIX_FMT_MPEG2
+	{
+		.name = "MPEG-2 ES",
+		.fourcc = V4L2_PIX_FMT_MPEG2,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_MPEG2 */
+#ifdef V4L2_PIX_FMT_MPEG4
+	{
+		.name = "MPEG-4 part 2 ES",
+		.fourcc = V4L2_PIX_FMT_MPEG4,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_MPEG4 */
+#ifdef V4L2_PIX_FMT_XVID
+	{
+		.name = "Xvid",
+		.fourcc = V4L2_PIX_FMT_XVID,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_XVID */
+#ifdef V4L2_PIX_FMT_VC1_ANNEX_G
+	{
+		.name = "SMPTE 421M Annex G compliant stream",
+		.fourcc = V4L2_PIX_FMT_VC1_ANNEX_G,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_VC1_ANNEX_G */
+#ifdef V4L2_PIX_FMT_VC1_ANNEX_L
+	{
+		.name = "SMPTE 421M Annex L compliant stream",
+		.fourcc = V4L2_PIX_FMT_VC1_ANNEX_L,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_VC1_ANNEX_L */
+#ifdef V4L2_PIX_FMT_VP8
+	{
+		.name = "VP8",
+		.fourcc = V4L2_PIX_FMT_VP8,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_VP8 */
+#ifdef V4L2_PIX_FMT_VP9
+	{
+		.name = "VP9",
+		.fourcc = V4L2_PIX_FMT_VP9,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_VP9 */
+#ifdef V4L2_PIX_FMT_HEVC
+	{
+		.name = "HEVC",
+		.fourcc = V4L2_PIX_FMT_HEVC,
+		.depth = 32,
+		.flags = FORMAT_FLAGS_COMPRESSED,
+	},
+#endif /* V4L2_PIX_FMT_HEVC */
+};
diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile
index f2b19e6174af..4fef4b174321 100644
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -1,4 +1,10 @@
 # SPDX-License-Identifier: GPL-2.0
+ifdef CONFIG_X86_64
+ifdef CONFIG_SATA_AHCI
+obj-y += intel-nvme-remap.o
+endif
+endif
+
 obj-$(CONFIG_PCIE_CADENCE) += cadence/
 obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
 obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o
diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c
new file mode 100644
index 000000000000..e105e6f5cc91
--- /dev/null
+++ b/drivers/pci/controller/intel-nvme-remap.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel remapped NVMe device support.
+ *
+ * Copyright (c) 2019 Endless Mobile, Inc.
+ * Author: Daniel Drake <drake@endlessm.com>
+ *
+ * Some products ship by default with the SATA controller in "RAID" or
+ * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this
+ * mode, which we refer to as "remapped NVMe" mode, any installed NVMe
+ * devices disappear from the PCI bus, and instead their I/O memory becomes
+ * available within the AHCI device BARs.
+ *
+ * This scheme is understood to be a way of avoiding usage of the standard
+ * Windows NVMe driver under that OS, instead mandating usage of Intel's
+ * driver instead, which has better power management, and presumably offers
+ * some RAID/disk-caching solutions too.
+ *
+ * Here in this driver, we support the remapped NVMe mode by claiming the
+ * AHCI device and creating a fake PCIe root port. On the new bus, the
+ * original AHCI device is exposed with only minor tweaks. Then, fake PCI
+ * devices corresponding to the remapped NVMe devices are created. The usual
+ * ahci and nvme drivers are then expected to bind to these devices and
+ * operate as normal.
+ *
+ * The PCI configuration space for the NVMe devices is completely
+ * unavailable, so we fake a minimal one and hope for the best.
+ *
+ * Interrupts are shared between the AHCI and NVMe devices. For simplicity,
+ * we only support the legacy interrupt here, although MSI support
+ * could potentially be added later.
+ */
+
+#define MODULE_NAME "intel-nvme-remap"
+
+#include <linux/ahci-remap.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#define AHCI_PCI_BAR_STANDARD 5
+
+struct nvme_remap_dev {
+	struct pci_dev		*dev;		/* AHCI device */
+	struct pci_bus		*bus;		/* our fake PCI bus */
+	struct pci_sysdata	sysdata;
+	int			irq_base;	/* our fake interrupts */
+
+	/*
+	 * When we detect an all-ones write to a BAR register, this flag
+	 * is set, so that we return the BAR size on the next read (a
+	 * standard PCI behaviour).
+	 * This includes the assumption that an all-ones BAR write is
+	 * immediately followed by a read of the same register.
+	 */
+	bool			bar_sizing;
+
+	/*
+	 * Resources copied from the AHCI device, to be regarded as
+	 * resources on our fake bus.
+	 */
+	struct resource		ahci_resources[PCI_NUM_RESOURCES];
+
+	/* Resources corresponding to the NVMe devices. */
+	struct resource		remapped_dev_mem[AHCI_MAX_REMAP];
+
+	/* Number of remapped NVMe devices found. */
+	int			num_remapped_devices;
+};
+
+static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus)
+{
+	return container_of(bus->sysdata, struct nvme_remap_dev, sysdata);
+}
+
+
+/******** PCI configuration space **********/
+
+/*
+ * Helper macros for tweaking returned contents of PCI configuration space.
+ *
+ * value contains len bytes of data read from reg.
+ * If fixup_reg is included in that range, fix up the contents of that
+ * register to fixed_value.
+ */
+#define NR_FIX8(fixup_reg, fixed_value) do { \
+		if (reg <= fixup_reg && fixup_reg < reg + len) \
+			((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \
+	} while (0)
+
+#define NR_FIX16(fixup_reg, fixed_value) do { \
+		NR_FIX8(fixup_reg, fixed_value); \
+		NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
+	} while (0)
+
+#define NR_FIX24(fixup_reg, fixed_value) do { \
+		NR_FIX8(fixup_reg, fixed_value); \
+		NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
+		NR_FIX8(fixup_reg + 2, fixed_value >> 16); \
+	} while (0)
+
+#define NR_FIX32(fixup_reg, fixed_value) do { \
+		NR_FIX16(fixup_reg, (u16) fixed_value); \
+		NR_FIX16(fixup_reg + 2, fixed_value >> 16); \
+	} while (0)
+
+/*
+ * Read PCI config space of the slot 0 (AHCI) device.
+ * We pass through the read request to the underlying device, but
+ * tweak the results in some cases.
+ */
+static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg,
+				     int len, u32 *value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+	struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
+	int ret;
+
+	ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn,
+				      reg, len, value);
+	if (ret)
+		return ret;
+
+	/*
+	 * Adjust the device class, to prevent this driver from attempting to
+	 * additionally probe the device we're simulating here.
+	 */
+	NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI);
+
+	/*
+	 * Unset interrupt pin, otherwise ACPI tries to find routing
+	 * info for our virtual IRQ, fails, and complains.
+	 */
+	NR_FIX8(PCI_INTERRUPT_PIN, 0);
+
+	/*
+	 * Truncate the AHCI BAR to not include the region that covers the
+	 * hidden devices. This will cause the ahci driver to successfully
+	 * probe th new device (instead of handing it over to this driver).
+	 */
+	if (nrdev->bar_sizing) {
+		NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1));
+		nrdev->bar_sizing = false;
+	}
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+/*
+ * Read PCI config space of a remapped device.
+ * Since the original PCI config space is inaccessible, we provide a minimal,
+ * fake config space instead.
+ */
+static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port,
+					int reg, int len, u32 *value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+	struct resource *remapped_mem;
+
+	if (port > nrdev->num_remapped_devices)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	*value = 0;
+	remapped_mem = &nrdev->remapped_dev_mem[port - 1];
+
+	/* Set a Vendor ID, otherwise Linux assumes no device is present */
+	NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL);
+
+	/* Always appear on & bus mastering */
+	NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
+
+	/* Set class so that nvme driver probes us */
+	NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS);
+
+	if (nrdev->bar_sizing) {
+		NR_FIX32(PCI_BASE_ADDRESS_0,
+			 ~(resource_size(remapped_mem) - 1));
+		nrdev->bar_sizing = false;
+	} else {
+		resource_size_t mem_start = remapped_mem->start;
+
+		mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+		NR_FIX32(PCI_BASE_ADDRESS_0, mem_start);
+		mem_start >>= 32;
+		NR_FIX32(PCI_BASE_ADDRESS_1, mem_start);
+	}
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+/* Read PCI configuration space. */
+static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn,
+			       int reg, int len, u32 *value)
+{
+	if (PCI_SLOT(devfn) == 0)
+		return nvme_remap_pci_read_slot0(bus, reg, len, value);
+	else
+		return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn),
+						    reg, len, value);
+}
+
+/*
+ * Write PCI config space of the slot 0 (AHCI) device.
+ * Apart from the special case of BAR sizing, we disable all writes.
+ * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master)
+ * that would affect the operation of the NVMe devices.
+ */
+static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg,
+				      int len, u32 value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+	struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
+
+	if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) {
+		/*
+		 * Writing all-ones to a BAR means that the size of the
+		 * memory region is being checked. Flag this so that we can
+		 * reply with an appropriate size on the next read.
+		 */
+		if (value == ~0)
+			nrdev->bar_sizing = true;
+
+		return ahci_dev_bus->ops->write(ahci_dev_bus,
+						nrdev->dev->devfn,
+						reg, len, value);
+	}
+
+	return PCIBIOS_SET_FAILED;
+}
+
+/*
+ * Write PCI config space of a remapped device.
+ * Since the original PCI config space is inaccessible, we reject all
+ * writes, except for the special case of BAR probing.
+ */
+static int nvme_remap_pci_write_remapped(struct pci_bus *bus,
+					 unsigned int port,
+					 int reg, int len, u32 value)
+{
+	struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+
+	if (port > nrdev->num_remapped_devices)
+		return PCIBIOS_DEVICE_NOT_FOUND;
+
+	/*
+	 * Writing all-ones to a BAR means that the size of the memory
+	 * region is being checked. Flag this so that we can reply with
+	 * an appropriate size on the next read.
+	 */
+	if (value == ~0 && reg >= PCI_BASE_ADDRESS_0
+			&& reg <= PCI_BASE_ADDRESS_5) {
+		nrdev->bar_sizing = true;
+		return PCIBIOS_SUCCESSFUL;
+	}
+
+	return PCIBIOS_SET_FAILED;
+}
+
+/* Write PCI configuration space. */
+static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn,
+				int reg, int len, u32 value)
+{
+	if (PCI_SLOT(devfn) == 0)
+		return nvme_remap_pci_write_slot0(bus, reg, len, value);
+	else
+		return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn),
+						     reg, len, value);
+}
+
+static struct pci_ops nvme_remap_pci_ops = {
+	.read	= nvme_remap_pci_read,
+	.write	= nvme_remap_pci_write,
+};
+
+
+/******** Initialization & exit **********/
+
+/*
+ * Find a PCI domain ID to use for our fake bus.
+ * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits).
+ */
+static int find_free_domain(void)
+{
+	int domain = 0xffff;
+	struct pci_bus *bus = NULL;
+
+	while ((bus = pci_find_next_bus(bus)) != NULL)
+		domain = max_t(int, domain, pci_domain_nr(bus));
+
+	return domain + 1;
+}
+
+static int find_remapped_devices(struct nvme_remap_dev *nrdev,
+				 struct list_head *resources)
+{
+	void __iomem *mmio;
+	int i, count = 0;
+	u32 cap;
+
+	mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD,
+			  pci_resource_len(nrdev->dev,
+					   AHCI_PCI_BAR_STANDARD));
+	if (!mmio)
+		return -ENODEV;
+
+	/* Check if this device might have remapped nvme devices. */
+	if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K ||
+	    !(readl(mmio + AHCI_VSCAP) & 1))
+		return -ENODEV;
+
+	cap = readq(mmio + AHCI_REMAP_CAP);
+	for (i = AHCI_MAX_REMAP-1; i >= 0; i--) {
+		struct resource *remapped_mem;
+
+		if ((cap & (1 << i)) == 0)
+			continue;
+		if (readl(mmio + ahci_remap_dcc(i))
+				!= PCI_CLASS_STORAGE_EXPRESS)
+			continue;
+
+		/* We've found a remapped device */
+		remapped_mem = &nrdev->remapped_dev_mem[count++];
+		remapped_mem->start =
+			pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD)
+			+ ahci_remap_base(i);
+		remapped_mem->end = remapped_mem->start
+			+ AHCI_REMAP_N_SIZE - 1;
+		remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED;
+		pci_add_resource(resources, remapped_mem);
+	}
+
+	pcim_iounmap(nrdev->dev, mmio);
+
+	if (count == 0)
+		return -ENODEV;
+
+	nrdev->num_remapped_devices = count;
+	dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n",
+		 nrdev->num_remapped_devices);
+	return 0;
+}
+
+static void nvme_remap_remove_root_bus(void *data)
+{
+	struct pci_bus *bus = data;
+
+	pci_stop_root_bus(bus);
+	pci_remove_root_bus(bus);
+}
+
+static int nvme_remap_probe(struct pci_dev *dev,
+			    const struct pci_device_id *id)
+{
+	struct nvme_remap_dev *nrdev;
+	LIST_HEAD(resources);
+	int i;
+	int ret;
+	struct pci_dev *child;
+
+	nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL);
+	nrdev->sysdata.domain = find_free_domain();
+	nrdev->sysdata.nvme_remap_dev = dev;
+	nrdev->dev = dev;
+	pci_set_drvdata(dev, nrdev);
+
+	ret = pcim_enable_device(dev);
+	if (ret < 0)
+		return ret;
+
+	pci_set_master(dev);
+
+	ret = find_remapped_devices(nrdev, &resources);
+	if (ret)
+		return ret;
+
+	/* Add resources from the original AHCI device */
+	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+		struct resource *res = &dev->resource[i];
+
+		if (res->start) {
+			struct resource *nr_res = &nrdev->ahci_resources[i];
+
+			nr_res->start = res->start;
+			nr_res->end = res->end;
+			nr_res->flags = res->flags;
+			pci_add_resource(&resources, nr_res);
+		}
+	}
+
+	/* Create virtual interrupts */
+	nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0,
+					       nrdev->num_remapped_devices + 1,
+					       0);
+	if (nrdev->irq_base < 0)
+		return nrdev->irq_base;
+
+	/* Create and populate PCI bus */
+	nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops,
+					 &nrdev->sysdata, &resources);
+	if (!nrdev->bus)
+		return -ENODEV;
+
+	if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus,
+				     nrdev->bus))
+		return -ENOMEM;
+
+	/* We don't support sharing MSI interrupts between these devices */
+	nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
+
+	pci_scan_child_bus(nrdev->bus);
+
+	list_for_each_entry(child, &nrdev->bus->devices, bus_list) {
+		/*
+		 * Prevent PCI core from trying to move memory BARs around.
+		 * The hidden NVMe devices are at fixed locations.
+		 */
+		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+			struct resource *res = &child->resource[i];
+
+			if (res->flags & IORESOURCE_MEM)
+				res->flags |= IORESOURCE_PCI_FIXED;
+		}
+
+		/* Share the legacy IRQ between all devices */
+		child->irq = dev->irq;
+	}
+
+	pci_assign_unassigned_bus_resources(nrdev->bus);
+	pci_bus_add_devices(nrdev->bus);
+
+	return 0;
+}
+
+static const struct pci_device_id nvme_remap_ids[] = {
+	/*
+	 * Match all Intel RAID controllers.
+	 *
+	 * There's overlap here with the set of devices detected by the ahci
+	 * driver, but ahci will only successfully probe when there
+	 * *aren't* any remapped NVMe devices, and this driver will only
+	 * successfully probe when there *are* remapped NVMe devices that
+	 * need handling.
+	 */
+	{
+		PCI_VDEVICE(INTEL, PCI_ANY_ID),
+		.class = PCI_CLASS_STORAGE_RAID << 8,
+		.class_mask = 0xffffff00,
+	},
+	{0,}
+};
+MODULE_DEVICE_TABLE(pci, nvme_remap_ids);
+
+static struct pci_driver nvme_remap_drv = {
+	.name		= MODULE_NAME,
+	.id_table	= nvme_remap_ids,
+	.probe		= nvme_remap_probe,
+};
+module_pci_driver(nvme_remap_drv);
+
+MODULE_AUTHOR("Daniel Drake <drake@endlessm.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 568410e64ce6..192d0557fb05 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3732,6 +3732,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev)
 	dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET;
 }
 
+static bool acs_on_downstream;
+static bool acs_on_multifunction;
+
+#define NUM_ACS_IDS 16
+struct acs_on_id {
+	unsigned short vendor;
+	unsigned short device;
+};
+static struct acs_on_id acs_on_ids[NUM_ACS_IDS];
+static u8 max_acs_id;
+
+static __init int pcie_acs_override_setup(char *p)
+{
+	if (!p)
+		return -EINVAL;
+
+	while (*p) {
+		if (!strncmp(p, "downstream", 10))
+			acs_on_downstream = true;
+		if (!strncmp(p, "multifunction", 13))
+			acs_on_multifunction = true;
+		if (!strncmp(p, "id:", 3)) {
+			char opt[5];
+			int ret;
+			long val;
+
+			if (max_acs_id >= NUM_ACS_IDS - 1) {
+				pr_warn("Out of PCIe ACS override slots (%d)\n",
+						NUM_ACS_IDS);
+				goto next;
+			}
+
+			p += 3;
+			snprintf(opt, 5, "%s", p);
+			ret = kstrtol(opt, 16, &val);
+			if (ret) {
+				pr_warn("PCIe ACS ID parse error %d\n", ret);
+				goto next;
+			}
+			acs_on_ids[max_acs_id].vendor = val;
+
+			p += strcspn(p, ":");
+			if (*p != ':') {
+				pr_warn("PCIe ACS invalid ID\n");
+				goto next;
+			}
+
+			p++;
+			snprintf(opt, 5, "%s", p);
+			ret = kstrtol(opt, 16, &val);
+			if (ret) {
+				pr_warn("PCIe ACS ID parse error %d\n", ret);
+				goto next;
+			}
+			acs_on_ids[max_acs_id].device = val;
+			max_acs_id++;
+		}
+next:
+		p += strcspn(p, ",");
+		if (*p == ',')
+			p++;
+	}
+
+	if (acs_on_downstream || acs_on_multifunction || max_acs_id)
+		pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n");
+
+	return 0;
+}
+early_param("pcie_acs_override", pcie_acs_override_setup);
+
+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags)
+{
+	int i;
+
+	/* Never override ACS for legacy devices or devices with ACS caps */
+	if (!pci_is_pcie(dev) ||
+		pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS))
+			return -ENOTTY;
+
+	for (i = 0; i < max_acs_id; i++)
+		if (acs_on_ids[i].vendor == dev->vendor &&
+			acs_on_ids[i].device == dev->device)
+				return 1;
+
+	switch (pci_pcie_type(dev)) {
+	case PCI_EXP_TYPE_DOWNSTREAM:
+	case PCI_EXP_TYPE_ROOT_PORT:
+		if (acs_on_downstream)
+			return 1;
+		break;
+	case PCI_EXP_TYPE_ENDPOINT:
+	case PCI_EXP_TYPE_UPSTREAM:
+	case PCI_EXP_TYPE_LEG_END:
+	case PCI_EXP_TYPE_RC_END:
+		if (acs_on_multifunction && dev->multifunction)
+			return 1;
+	}
+
+	return -ENOTTY;
+}
 /*
  * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be
  * prevented for those affected devices.
@@ -5143,6 +5243,7 @@ static const struct pci_dev_acs_enabled {
 	{ PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs },
 	/* Wangxun nics */
 	{ PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs },
+	{ PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides },
 	{ 0 }
 };
 
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 20f7e98ee8af..0f5aad20ced7 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -577,12 +577,6 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div,
 #define CPUFREQ_POLICY_POWERSAVE	(1)
 #define CPUFREQ_POLICY_PERFORMANCE	(2)
 
-/*
- * The polling frequency depends on the capability of the processor. Default
- * polling frequency is 1000 times the transition latency of the processor.
- */
-#define LATENCY_MULTIPLIER		(1000)
-
 struct cpufreq_governor {
 	char	name[CPUFREQ_NAME_LEN];
 	int	(*init)(struct cpufreq_policy *policy);
diff --git a/include/linux/minmax.h b/include/linux/minmax.h
index 2ec559284a9f..a7ef65f78933 100644
--- a/include/linux/minmax.h
+++ b/include/linux/minmax.h
@@ -270,4 +270,11 @@ static inline bool in_range32(u32 val, u32 start, u32 len)
 #define swap(a, b) \
 	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
 
+/*
+ * Use these carefully: no type checking, and uses the arguments
+ * multiple times. Use for obvious constants only.
+ */
+#define MIN_T(type,a,b) __cmp(min,(type)(a),(type)(b))
+#define MAX_T(type,a,b) __cmp(max,(type)(a),(type)(b))
+
 #endif	/* _LINUX_MINMAX_H */
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 547e82cdc89a..fc6b9c87cb0a 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -41,13 +41,13 @@ extern unsigned int pageblock_order;
  * Huge pages are a constant size, but don't exceed the maximum allocation
  * granularity.
  */
-#define pageblock_order		min_t(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER)
+#define pageblock_order		MIN_T(unsigned int, HUGETLB_PAGE_ORDER, MAX_PAGE_ORDER)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
 #elif defined(CONFIG_TRANSPARENT_HUGEPAGE)
 
-#define pageblock_order		min_t(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER)
+#define pageblock_order		MIN_T(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER)
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a0a026d2d244..8bece21a8998 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1281,7 +1281,7 @@ struct readahead_control {
 		._index = i,						\
 	}
 
-#define VM_READAHEAD_PAGES	(SZ_128K / PAGE_SIZE)
+#define VM_READAHEAD_PAGES	(SZ_8M / PAGE_SIZE)
 
 void page_cache_ra_unbounded(struct readahead_control *,
 		unsigned long nr_to_read, unsigned long lookahead_count);
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 6030a8235617..60b7fe5fa74a 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -156,6 +156,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns,
 
 #ifdef CONFIG_USER_NS
 
+extern int unprivileged_userns_clone;
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	if (ns)
@@ -189,6 +191,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns);
 struct ns_common *ns_get_owner(struct ns_common *ns);
 #else
 
+#define unprivileged_userns_clone 0
+
 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
 {
 	return &init_user_ns;
diff --git a/init/Kconfig b/init/Kconfig
index 9684e5d2b81c..fdfe1928de17 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -132,6 +132,10 @@ config THREAD_INFO_IN_TASK
 
 menu "General setup"
 
+config CACHY
+    bool "Some kernel tweaks by CachyOS"
+    default y
+
 config BROKEN
 	bool
 
@@ -1251,6 +1255,22 @@ config USER_NS
 
 	  If unsure, say N.
 
+config USER_NS_UNPRIVILEGED
+	bool "Allow unprivileged users to create namespaces"
+	default y
+	depends on USER_NS
+	help
+	  When disabled, unprivileged users will not be able to create
+	  new namespaces. Allowing users to create their own namespaces
+	  has been part of several recent local privilege escalation
+	  exploits, so if you need user namespaces but are
+	  paranoid^Wsecurity-conscious you want to disable this.
+
+	  This setting can be overridden at runtime via the
+	  kernel.unprivileged_userns_clone sysctl.
+
+	  If unsure, say Y.
+
 config PID_NS
 	bool "PID Namespaces"
 	default y
@@ -1393,6 +1413,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE
 	  with the "-O2" compiler flag for best performance and most
 	  helpful compile-time warnings.
 
+config CC_OPTIMIZE_FOR_PERFORMANCE_O3
+	bool "Optimize more for performance (-O3)"
+	help
+	  Choosing this option will pass "-O3" to your compiler to optimize
+	  the kernel yet more for performance.
+
 config CC_OPTIMIZE_FOR_SIZE
 	bool "Optimize for size (-Os)"
 	help
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 38ef6d06888e..0f78364efd4f 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -40,6 +40,27 @@ choice
 	 on SMP and NUMA systems and exactly dividing by both PAL and
 	 NTSC frame rates for video and multimedia work.
 
+	config HZ_500
+		bool "500 HZ"
+	help
+	 500 Hz is a balanced timer frequency. Provides fast interactivity
+	 on desktops with good smoothness without increasing CPU power
+	 consumption and sacrificing the battery life on laptops.
+
+	config HZ_600
+		bool "600 HZ"
+	help
+	 600 Hz is a balanced timer frequency. Provides fast interactivity
+	 on desktops with good smoothness without increasing CPU power
+	 consumption and sacrificing the battery life on laptops.
+
+	config HZ_750
+		bool "750 HZ"
+	help
+	 750 Hz is a balanced timer frequency. Provides fast interactivity
+	 on desktops with good smoothness without increasing CPU power
+	 consumption and sacrificing the battery life on laptops.
+
 	config HZ_1000
 		bool "1000 HZ"
 	help
@@ -53,6 +74,9 @@ config HZ
 	default 100 if HZ_100
 	default 250 if HZ_250
 	default 300 if HZ_300
+	default 500 if HZ_500
+	default 600 if HZ_600
+	default 750 if HZ_750
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
diff --git a/kernel/fork.c b/kernel/fork.c
index 99076dbe27d8..18750b83c564 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -104,6 +104,10 @@
 #include <uapi/linux/pidfd.h>
 #include <linux/pidfs.h>
 
+#ifdef CONFIG_USER_NS
+#include <linux/user_namespace.h>
+#endif
+
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
@@ -2154,6 +2158,10 @@ __latent_entropy struct task_struct *copy_process(
 	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
 		return ERR_PTR(-EINVAL);
 
+	if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone)
+		if (!capable(CAP_SYS_ADMIN))
+			return ERR_PTR(-EPERM);
+
 	/*
 	 * Thread groups must share signals as well, and detached threads
 	 * can only be started up within the thread group.
@@ -3301,6 +3309,12 @@ int ksys_unshare(unsigned long unshare_flags)
 	if (unshare_flags & CLONE_NEWNS)
 		unshare_flags |= CLONE_FS;
 
+	if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) {
+		err = -EPERM;
+		if (!capable(CAP_SYS_ADMIN))
+			goto bad_unshare_out;
+	}
+
 	err = check_unshare_flags(unshare_flags);
 	if (err)
 		goto bad_unshare_out;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 483c137b9d3d..1fee282d40aa 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -73,10 +73,19 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
  *
  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
+#ifdef CONFIG_CACHY
+unsigned int sysctl_sched_base_slice			= 350000ULL;
+static unsigned int normalized_sysctl_sched_base_slice	= 350000ULL;
+#else
 unsigned int sysctl_sched_base_slice			= 750000ULL;
 static unsigned int normalized_sysctl_sched_base_slice	= 750000ULL;
+#endif
 
+#ifdef CONFIG_CACHY
+const_debug unsigned int sysctl_sched_migration_cost	= 300000UL;
+#else
 const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
+#endif
 
 static int __init setup_sched_thermal_decay_shift(char *str)
 {
@@ -121,8 +130,12 @@ int __weak arch_asym_cpu_priority(int cpu)
  *
  * (default: 5 msec, units: microseconds)
  */
+#ifdef CONFIG_CACHY
+static unsigned int sysctl_sched_cfs_bandwidth_slice		= 3000UL;
+#else
 static unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
 #endif
+#endif
 
 #ifdef CONFIG_NUMA_BALANCING
 /* Restrict the NUMA promotion throughput (MB/s) for each target node. */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 38aeedd8a6cc..556466836cd5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2544,7 +2544,7 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
 
 extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
 
-#ifdef CONFIG_PREEMPT_RT
+#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_CACHY)
 #define SCHED_NR_MIGRATE_BREAK 8
 #else
 #define SCHED_NR_MIGRATE_BREAK 32
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e0b917328cf9..e70ae9c11dea 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -80,6 +80,9 @@
 #ifdef CONFIG_RT_MUTEXES
 #include <linux/rtmutex.h>
 #endif
+#ifdef CONFIG_USER_NS
+#include <linux/user_namespace.h>
+#endif
 
 /* shared constants to be used in various sysctls */
 const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 };
@@ -1623,6 +1626,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_USER_NS
+	{
+		.procname	= "unprivileged_userns_clone",
+		.data		= &unprivileged_userns_clone,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_PROC_SYSCTL
 	{
 		.procname	= "tainted",
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 0b0b95418b16..c4b835b91fc0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -22,6 +22,13 @@
 #include <linux/bsearch.h>
 #include <linux/sort.h>
 
+/* sysctl */
+#ifdef CONFIG_USER_NS_UNPRIVILEGED
+int unprivileged_userns_clone = 1;
+#else
+int unprivileged_userns_clone;
+#endif
+
 static struct kmem_cache *user_ns_cachep __ro_after_init;
 static DEFINE_MUTEX(userns_state_mutex);
 
diff --git a/mm/Kconfig b/mm/Kconfig
index b4cb45255a54..8635b3b24739 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -613,7 +613,7 @@ config COMPACTION
 config COMPACT_UNEVICTABLE_DEFAULT
 	int
 	depends on COMPACTION
-	default 0 if PREEMPT_RT
+	default 0 if PREEMPT_RT || CACHY
 	default 1
 
 #
diff --git a/mm/compaction.c b/mm/compaction.c
index 739b1bf3d637..3a4269c02fb2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1950,7 +1950,11 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE
  * aggressively the kernel should compact memory in the
  * background. It takes values in the range [0, 100].
  */
+#ifdef CONFIG_CACHY
+static unsigned int __read_mostly sysctl_compaction_proactiveness;
+#else
 static unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+#endif
 static int sysctl_extfrag_threshold = 500;
 static int __read_mostly sysctl_compact_memory;
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4d9c1277e5e4..8d551e3be086 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -63,7 +63,11 @@ unsigned long transparent_hugepage_flags __read_mostly =
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
 	(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
 #endif
+#ifdef CONFIG_CACHY
+	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG)|
+#else
 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
+#endif
 	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
 	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8a1c92090129..a415623cde8f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -71,7 +71,11 @@ static long ratelimit_pages = 32;
 /*
  * Start background writeback (via writeback threads) at this percentage
  */
+#ifdef CONFIG_CACHY
+static int dirty_background_ratio = 5;
+#else
 static int dirty_background_ratio = 10;
+#endif
 
 /*
  * dirty_background_bytes starts at 0 (disabled) so that it is a function of
@@ -99,7 +103,11 @@ static unsigned long vm_dirty_bytes;
 /*
  * The interval between `kupdate'-style writebacks
  */
+#ifdef CONFIG_CACHY
+unsigned int dirty_writeback_interval = 10 * 100; /* centiseconds */
+#else
 unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
+#endif
 
 EXPORT_SYMBOL_GPL(dirty_writeback_interval);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 21016573f187..8afba9085a4b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -271,7 +271,11 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
 
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
+#ifdef CONFIG_CACHY
+static int watermark_boost_factor __read_mostly;
+#else
 static int watermark_boost_factor __read_mostly = 15000;
+#endif
 static int watermark_scale_factor = 10;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
diff --git a/mm/swap.c b/mm/swap.c
index 67786cb77130..6a91db6f3302 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1111,6 +1111,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
  */
 void __init swap_setup(void)
 {
+#ifdef CONFIG_CACHY
+	/* Only swap-in pages requested, avoid readahead */
+	page_cluster = 0;
+#else
 	unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
 
 	/* Use a smaller cluster for small-memory machines */
@@ -1122,4 +1126,5 @@ void __init swap_setup(void)
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+#endif
 }
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index bd5183dfd879..3a410f53a07c 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
  * essence, they are percents: the higher the value, the more number
  * unsuccessful reclaims there were.
  */
+#ifdef CONFIG_CACHY
+static const unsigned int vmpressure_level_med = 65;
+#else
 static const unsigned int vmpressure_level_med = 60;
+#endif
 static const unsigned int vmpressure_level_critical = 95;
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 99a0c152a85e..75bd34426f48 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -191,7 +191,11 @@ struct scan_control {
 /*
  * From 0 .. 200.  Higher means more swappy.
  */
+#ifdef CONFIG_CACHY
+int vm_swappiness = 20;
+#else
 int vm_swappiness = 60;
+#endif
 
 #ifdef CONFIG_MEMCG
 
@@ -3953,7 +3957,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
 }
 
 /* to protect the working set of the last N jiffies */
+#ifdef CONFIG_CACHY
+static unsigned long lru_gen_min_ttl __read_mostly = 1000;
+#else
 static unsigned long lru_gen_min_ttl __read_mostly;
+#endif
 
 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
diff --git a/scripts/Makefile.package b/scripts/Makefile.package
index bf016af8bf8a..11d53f240a2b 100644
--- a/scripts/Makefile.package
+++ b/scripts/Makefile.package
@@ -141,6 +141,18 @@ snap-pkg:
 	cd $(objtree)/snap && \
 	snapcraft --target-arch=$(UTS_MACHINE)
 
+# pacman-pkg
+# ---------------------------------------------------------------------------
+
+PHONY += pacman-pkg
+pacman-pkg:
+	@ln -srf $(srctree)/scripts/package/PKGBUILD $(objtree)/PKGBUILD
+	+BUILDDIR="$(realpath $(objtree))/pacman" \
+		CARCH="$(UTS_MACHINE)" \
+		KBUILD_MAKEFLAGS="$(MAKEFLAGS)" \
+		KBUILD_REVISION="$(shell $(srctree)/scripts/build-version)" \
+		makepkg $(MAKEPKGOPTS)
+
 # dir-pkg tar*-pkg - tarball targets
 # ---------------------------------------------------------------------------
 
@@ -221,6 +233,7 @@ help:
 	@echo '  bindeb-pkg          - Build only the binary kernel deb package'
 	@echo '  snap-pkg            - Build only the binary kernel snap package'
 	@echo '                        (will connect to external hosts)'
+	@echo '  pacman-pkg          - Build only the binary kernel pacman package'
 	@echo '  dir-pkg             - Build the kernel as a plain directory structure'
 	@echo '  tar-pkg             - Build the kernel as an uncompressed tarball'
 	@echo '  targz-pkg           - Build the kernel as a gzip compressed tarball'
diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD
new file mode 100644
index 000000000000..f83493838cf9
--- /dev/null
+++ b/scripts/package/PKGBUILD
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Maintainer: Thomas Weißschuh <linux@weissschuh.net>
+# Contributor: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
+
+pkgbase=${PACMAN_PKGBASE:-linux-upstream}
+pkgname=("${pkgbase}")
+
+_extrapackages=${PACMAN_EXTRAPACKAGES-headers api-headers debug}
+for pkg in $_extrapackages; do
+	pkgname+=("${pkgbase}-${pkg}")
+done
+
+pkgver="${KERNELRELEASE//-/_}"
+# The PKGBUILD is evaluated multiple times.
+# Running scripts/build-version from here would introduce inconsistencies.
+pkgrel="${KBUILD_REVISION}"
+pkgdesc='Upstream Linux'
+url='https://www.kernel.org/'
+# Enable flexible cross-compilation
+arch=(${CARCH})
+license=(GPL-2.0-only)
+makedepends=(
+	bc
+	bison
+	cpio
+	flex
+	gettext
+	kmod
+	libelf
+	openssl
+	pahole
+	perl
+	python
+	rsync
+	tar
+)
+options=(!debug !strip !buildflags !makeflags)
+
+_prologue() {
+	# MAKEFLAGS from makepkg.conf override the ones inherited from kbuild.
+	# Bypass this override with a custom variable.
+	export MAKEFLAGS="${KBUILD_MAKEFLAGS}"
+
+	# Kbuild works in the output directory, where this PKGBUILD is located.
+	cd "$(dirname "${BASH_SOURCE[0]}")"
+}
+
+build() {
+	_prologue
+
+	${MAKE} KERNELRELEASE="${KERNELRELEASE}" KBUILD_BUILD_VERSION="${pkgrel}"
+}
+
+_package() {
+	pkgdesc="The ${pkgdesc} kernel and modules"
+
+	local modulesdir="${pkgdir}/usr/${MODLIB}"
+
+	_prologue
+
+	echo "Installing boot image..."
+	# systemd expects to find the kernel here to allow hibernation
+	# https://github.com/systemd/systemd/commit/edda44605f06a41fb86b7ab8128dcf99161d2344
+	install -Dm644 "$(${MAKE} -s image_name)" "${modulesdir}/vmlinuz"
+
+	# Used by mkinitcpio to name the kernel
+	echo "${pkgbase}" > "${modulesdir}/pkgbase"
+
+	echo "Installing modules..."
+	${MAKE} INSTALL_MOD_PATH="${pkgdir}/usr" INSTALL_MOD_STRIP=1 \
+		DEPMOD=true modules_install
+
+	if [ -d "${srctree}/arch/${SRCARCH}/boot/dts" ]; then
+		echo "Installing dtbs..."
+		${MAKE} INSTALL_DTBS_PATH="${modulesdir}/dtb" dtbs_install
+	fi
+
+	# remove build link, will be part of -headers package
+	rm -f "${modulesdir}/build"
+}
+
+_package-headers() {
+	pkgdesc="Headers and scripts for building modules for the ${pkgdesc} kernel"
+
+	local builddir="${pkgdir}/usr/${MODLIB}/build"
+
+	_prologue
+
+	if grep -q CONFIG_MODULES=y include/config/auto.conf; then
+		echo "Installing build files..."
+		"${srctree}/scripts/package/install-extmod-build" "${builddir}"
+	fi
+
+	echo "Installing System.map and config..."
+	mkdir -p "${builddir}"
+	cp System.map "${builddir}/System.map"
+	cp .config "${builddir}/.config"
+
+	echo "Adding symlink..."
+	mkdir -p "${pkgdir}/usr/src"
+	ln -sr "${builddir}" "${pkgdir}/usr/src/${pkgbase}"
+}
+
+_package-api-headers() {
+	pkgdesc="Kernel headers sanitized for use in userspace"
+	provides=(linux-api-headers)
+	conflicts=(linux-api-headers)
+
+	_prologue
+
+	${MAKE} headers_install INSTALL_HDR_PATH="${pkgdir}/usr"
+}
+
+_package-debug(){
+	pkgdesc="Non-stripped vmlinux file for the ${pkgdesc} kernel"
+
+	local debugdir="${pkgdir}/usr/src/debug/${pkgbase}"
+	local builddir="${pkgdir}/usr/${MODLIB}/build"
+
+	_prologue
+
+	install -Dt "${debugdir}" -m644 vmlinux
+	mkdir -p "${builddir}"
+	ln -sr "${debugdir}/vmlinux" "${builddir}/vmlinux"
+}
+
+for _p in "${pkgname[@]}"; do
+	eval "package_$_p() {
+		$(declare -f "_package${_p#$pkgbase}")
+		_package${_p#$pkgbase}
+	}"
+done
-- 
2.46.0

From fc2f414e83a32041470dff37ba469ce8dca9a5c5 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sun, 8 Sep 2024 10:20:33 +0200
Subject: [PATCH 05/12] crypto

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 arch/x86/crypto/Kconfig                  |    1 +
 arch/x86/crypto/Makefile                 |    8 +-
 arch/x86/crypto/aes-gcm-aesni-x86_64.S   | 1128 +++++++++
 arch/x86/crypto/aes-gcm-avx10-x86_64.S   | 1222 ++++++++++
 arch/x86/crypto/aesni-intel_asm.S        | 1503 +-----------
 arch/x86/crypto/aesni-intel_avx-x86_64.S | 2804 ----------------------
 arch/x86/crypto/aesni-intel_glue.c       | 1269 ++++++----
 7 files changed, 3125 insertions(+), 4810 deletions(-)
 create mode 100644 arch/x86/crypto/aes-gcm-aesni-x86_64.S
 create mode 100644 arch/x86/crypto/aes-gcm-avx10-x86_64.S
 delete mode 100644 arch/x86/crypto/aesni-intel_avx-x86_64.S

diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig
index c9e59589a1ce..24875e6295f2 100644
--- a/arch/x86/crypto/Kconfig
+++ b/arch/x86/crypto/Kconfig
@@ -18,6 +18,7 @@ config CRYPTO_AES_NI_INTEL
 	depends on X86
 	select CRYPTO_AEAD
 	select CRYPTO_LIB_AES
+	select CRYPTO_LIB_GF128MUL
 	select CRYPTO_ALGAPI
 	select CRYPTO_SKCIPHER
 	select CRYPTO_SIMD
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 9c5ce5613738..53b4a277809e 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -48,8 +48,12 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o
 
 obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \
-	aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o
+aesni-intel-$(CONFIG_64BIT) += aes_ctrby8_avx-x86_64.o \
+			       aes-gcm-aesni-x86_64.o \
+			       aes-xts-avx-x86_64.o
+ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy)
+aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o
+endif
 
 obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
 sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o
diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
new file mode 100644
index 000000000000..45940e2883a0
--- /dev/null
+++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S
@@ -0,0 +1,1128 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// AES-NI optimized AES-GCM for x86_64
+//
+// Copyright 2024 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License").  You may obtain a copy
+// of the License at
+//
+//	http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+//
+// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
+// support the original set of AES instructions, i.e. AES-NI.  Two
+// implementations are provided, one that uses AVX and one that doesn't.  They
+// are very similar, being generated by the same macros.  The only difference is
+// that the AVX implementation takes advantage of VEX-coded instructions in some
+// places to avoid some 'movdqu' and 'movdqa' instructions.  The AVX
+// implementation does *not* use 256-bit vectors, as AES is not supported on
+// 256-bit vectors until the VAES feature (which this file doesn't target).
+//
+// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
+// for the *_aesni functions or AVX for the *_aesni_avx ones.  (But it seems
+// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
+//
+// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
+// more thoroughly commented.  This file has the following notable changes:
+//
+//    - The vector length is fixed at 128-bit, i.e. xmm registers.  This means
+//      there is only one AES block (and GHASH block) per register.
+//
+//    - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
+//      32.  We work around this by being much more careful about using
+//      registers, relying heavily on loads to load values as they are needed.
+//
+//    - Masking is not available either.  We work around this by implementing
+//      partial block loads and stores using overlapping scalar loads and stores
+//      combined with shifts and SSE4.1 insertion and extraction instructions.
+//
+//    - The main loop is organized differently due to the different design
+//      constraints.  First, with just one AES block per SIMD register, on some
+//      CPUs 4 registers don't saturate the 'aesenc' throughput.  We therefore
+//      do an 8-register wide loop.  Considering that and the fact that we have
+//      just 16 SIMD registers to work with, it's not feasible to cache AES
+//      round keys and GHASH key powers in registers across loop iterations.
+//      That's not ideal, but also not actually that bad, since loads can run in
+//      parallel with other instructions.  Significantly, this also makes it
+//      possible to roll up the inner loops, relying on hardware loop unrolling
+//      instead of software loop unrolling, greatly reducing code size.
+//
+//    - We implement the GHASH multiplications in the main loop using Karatsuba
+//      multiplication instead of schoolbook multiplication.  This saves one
+//      pclmulqdq instruction per block, at the cost of one 64-bit load, one
+//      pshufd, and 0.25 pxors per block.  (This is without the three-argument
+//      XOR support that would be provided by AVX512 / AVX10, which would be
+//      more beneficial to schoolbook than Karatsuba.)
+//
+//      As a rough approximation, we can assume that Karatsuba multiplication is
+//      faster than schoolbook multiplication in this context if one pshufd and
+//      0.25 pxors are cheaper than a pclmulqdq.  (We assume that the 64-bit
+//      load is "free" due to running in parallel with arithmetic instructions.)
+//      This is true on AMD CPUs, including all that support pclmulqdq up to at
+//      least Zen 3.  It's also true on older Intel CPUs: Westmere through
+//      Haswell on the Core side, and Silvermont through Goldmont Plus on the
+//      low-power side.  On some of these CPUs, pclmulqdq is quite slow, and the
+//      benefit of Karatsuba should be substantial.  On newer Intel CPUs,
+//      schoolbook multiplication should be faster, but only marginally.
+//
+//      Not all these CPUs were available to be tested.  However, benchmarks on
+//      available CPUs suggest that this approximation is plausible.  Switching
+//      to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
+//      Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
+//      Considering that and the fact that Karatsuba should be even more
+//      beneficial on older Intel CPUs, it seems like the right choice here.
+//
+//      An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
+//      saved by using a multiplication-less reduction method.  We don't do that
+//      because it would require a large number of shift and xor instructions,
+//      making it less worthwhile and likely harmful on newer CPUs.
+//
+//      It does make sense to sometimes use a different reduction optimization
+//      that saves a pclmulqdq, though: precompute the hash key times x^64, and
+//      multiply the low half of the data block by the hash key with the extra
+//      factor of x^64.  This eliminates one step of the reduction.  However,
+//      this is incompatible with Karatsuba multiplication.  Therefore, for
+//      multi-block processing we use Karatsuba multiplication with a regular
+//      reduction.  For single-block processing, we use the x^64 optimization.
+
+#include <linux/linkage.h>
+
+.section .rodata
+.p2align 4
+.Lbswap_mask:
+	.octa   0x000102030405060708090a0b0c0d0e0f
+.Lgfpoly:
+	.quad	0xc200000000000000
+.Lone:
+	.quad	1
+.Lgfpoly_and_internal_carrybit:
+	.octa	0xc2000000000000010000000000000001
+	// Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
+	// 'len' 0xff bytes and the rest zeroes.
+.Lzeropad_mask:
+	.octa	0xffffffffffffffffffffffffffffffff
+	.octa	0
+
+// Offsets in struct aes_gcm_key_aesni
+#define OFFSETOF_AESKEYLEN	480
+#define OFFSETOF_H_POWERS	496
+#define OFFSETOF_H_POWERS_XORED	624
+#define OFFSETOF_H_TIMES_X64	688
+
+.text
+
+// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq.  The fallback
+// assumes that all operands are distinct and that any mem operand is aligned.
+.macro	_vpclmulqdq	imm, src1, src2, dst
+.if USE_AVX
+	vpclmulqdq	\imm, \src1, \src2, \dst
+.else
+	movdqa		\src2, \dst
+	pclmulqdq	\imm, \src1, \dst
+.endif
+.endm
+
+// Do a vpshufb, or fall back to a movdqa and a pshufb.  The fallback assumes
+// that all operands are distinct and that any mem operand is aligned.
+.macro	_vpshufb	src1, src2, dst
+.if USE_AVX
+	vpshufb		\src1, \src2, \dst
+.else
+	movdqa		\src2, \dst
+	pshufb		\src1, \dst
+.endif
+.endm
+
+// Do a vpand, or fall back to a movdqu and a pand.  The fallback assumes that
+// all operands are distinct.
+.macro	_vpand		src1, src2, dst
+.if USE_AVX
+	vpand		\src1, \src2, \dst
+.else
+	movdqu		\src1, \dst
+	pand		\src2, \dst
+.endif
+.endm
+
+// XOR the unaligned memory operand \mem into the xmm register \reg.  \tmp must
+// be a temporary xmm register.
+.macro	_xor_mem_to_reg	mem, reg, tmp
+.if USE_AVX
+	vpxor		\mem, \reg, \reg
+.else
+	movdqu		\mem, \tmp
+	pxor		\tmp, \reg
+.endif
+.endm
+
+// Test the unaligned memory operand \mem against the xmm register \reg.  \tmp
+// must be a temporary xmm register.
+.macro	_test_mem	mem, reg, tmp
+.if USE_AVX
+	vptest		\mem, \reg
+.else
+	movdqu		\mem, \tmp
+	ptest		\tmp, \reg
+.endif
+.endm
+
+// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
+// and zeroize any remaining bytes.  Clobbers %rax, %rcx, and \tmp{64,32}.
+.macro	_load_partial_block	src, dst, tmp64, tmp32
+	sub		$8, %ecx		// LEN - 8
+	jle		.Lle8\@
+
+	// Load 9 <= LEN <= 15 bytes.
+	movq		(\src), \dst		// Load first 8 bytes
+	mov		(\src, %rcx), %rax	// Load last 8 bytes
+	neg		%ecx
+	shl		$3, %ecx
+	shr		%cl, %rax		// Discard overlapping bytes
+	pinsrq		$1, %rax, \dst
+	jmp		.Ldone\@
+
+.Lle8\@:
+	add		$4, %ecx		// LEN - 4
+	jl		.Llt4\@
+
+	// Load 4 <= LEN <= 8 bytes.
+	mov		(\src), %eax		// Load first 4 bytes
+	mov		(\src, %rcx), \tmp32	// Load last 4 bytes
+	jmp		.Lcombine\@
+
+.Llt4\@:
+	// Load 1 <= LEN <= 3 bytes.
+	add		$2, %ecx		// LEN - 2
+	movzbl		(\src), %eax		// Load first byte
+	jl		.Lmovq\@
+	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
+.Lcombine\@:
+	shl		$3, %ecx
+	shl		%cl, \tmp64
+	or		\tmp64, %rax		// Combine the two parts
+.Lmovq\@:
+	movq		%rax, \dst
+.Ldone\@:
+.endm
+
+// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
+// Clobbers %rax, %rcx, and %rsi.
+.macro	_store_partial_block	src, dst
+	sub		$8, %ecx		// LEN - 8
+	jl		.Llt8\@
+
+	// Store 8 <= LEN <= 15 bytes.
+	pextrq		$1, \src, %rax
+	mov		%ecx, %esi
+	shl		$3, %ecx
+	ror		%cl, %rax
+	mov		%rax, (\dst, %rsi)	// Store last LEN - 8 bytes
+	movq		\src, (\dst)		// Store first 8 bytes
+	jmp		.Ldone\@
+
+.Llt8\@:
+	add		$4, %ecx		// LEN - 4
+	jl		.Llt4\@
+
+	// Store 4 <= LEN <= 7 bytes.
+	pextrd		$1, \src, %eax
+	mov		%ecx, %esi
+	shl		$3, %ecx
+	ror		%cl, %eax
+	mov		%eax, (\dst, %rsi)	// Store last LEN - 4 bytes
+	movd		\src, (\dst)		// Store first 4 bytes
+	jmp		.Ldone\@
+
+.Llt4\@:
+	// Store 1 <= LEN <= 3 bytes.
+	pextrb		$0, \src, 0(\dst)
+	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
+	jl		.Ldone\@
+	pextrb		$1, \src, 1(\dst)
+	je		.Ldone\@
+	pextrb		$2, \src, 2(\dst)
+.Ldone\@:
+.endm
+
+// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
+// \b.  To complete all steps, this must be invoked with \i=0 through \i=9.
+// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
+// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
+.macro	_ghash_mul_step	i, a, a_times_x64, b, gfpoly, t0, t1
+
+	// MI = (a_L * b_H) + ((a*x^64)_L * b_L)
+.if \i == 0
+	_vpclmulqdq	$0x01, \a, \b, \t0
+.elseif \i == 1
+	_vpclmulqdq	$0x00, \a_times_x64, \b, \t1
+.elseif \i == 2
+	pxor		\t1, \t0
+
+	// HI = (a_H * b_H) + ((a*x^64)_H * b_L)
+.elseif \i == 3
+	_vpclmulqdq	$0x11, \a, \b, \t1
+.elseif \i == 4
+	pclmulqdq	$0x10, \a_times_x64, \b
+.elseif \i == 5
+	pxor		\t1, \b
+.elseif \i == 6
+
+	// Fold MI into HI.
+	pshufd		$0x4e, \t0, \t1		// Swap halves of MI
+.elseif \i == 7
+	pclmulqdq	$0x00, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+	pxor		\t1, \b
+.elseif \i == 9
+	pxor		\t0, \b
+.endif
+.endm
+
+// GHASH-multiply \a by \b and store the reduced product in \b.
+// See _ghash_mul_step for details.
+.macro	_ghash_mul	a, a_times_x64, b, gfpoly, t0, t1
+.irp i, 0,1,2,3,4,5,6,7,8,9
+	_ghash_mul_step	\i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
+.endr
+.endm
+
+// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
+// This does Karatsuba multiplication and must be paired with _ghash_reduce.  On
+// the first call, \lo, \mi, and \hi must be zero.  \a_xored must contain the
+// two halves of \a XOR'd together, i.e. a_L + a_H.  \b is clobbered.
+.macro	_ghash_mul_noreduce	a, a_xored, b, lo, mi, hi, t0
+
+	// LO += a_L * b_L
+	_vpclmulqdq	$0x00, \a, \b, \t0
+	pxor		\t0, \lo
+
+	// b_L + b_H
+	pshufd		$0x4e, \b, \t0
+	pxor		\b, \t0
+
+	// HI += a_H * b_H
+	pclmulqdq	$0x11, \a, \b
+	pxor		\b, \hi
+
+	// MI += (a_L + a_H) * (b_L + b_H)
+	pclmulqdq	$0x00, \a_xored, \t0
+	pxor		\t0, \mi
+.endm
+
+// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
+// This assumes that _ghash_mul_noreduce was used.
+.macro	_ghash_reduce	lo, mi, hi, dst, t0
+
+	movq		.Lgfpoly(%rip), \t0
+
+	// MI += LO + HI (needed because we used Karatsuba multiplication)
+	pxor		\lo, \mi
+	pxor		\hi, \mi
+
+	// Fold LO into MI.
+	pshufd		$0x4e, \lo, \dst
+	pclmulqdq	$0x00, \t0, \lo
+	pxor		\dst, \mi
+	pxor		\lo, \mi
+
+	// Fold MI into HI.
+	pshufd		$0x4e, \mi, \dst
+	pclmulqdq	$0x00, \t0, \mi
+	pxor		\hi, \dst
+	pxor		\mi, \dst
+.endm
+
+// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
+//
+// The whole GHASH update does:
+//
+//	GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
+//				blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
+//
+// This macro just does the first step: it does the unreduced multiplication
+// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
+// registers LO, MI, and GHASH_ACC a.k.a. HI.  It also zero-initializes the
+// inner block counter in %rax, which is a value that counts up by 8 for each
+// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
+//
+// To reduce the number of pclmulqdq instructions required, both this macro and
+// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
+// multiplication.  See the file comment for more details about this choice.
+//
+// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
+// encrypting, or SRC if decrypting.  They also expect the precomputed hash key
+// powers H^i and their XOR'd-together halves to be available in the struct
+// pointed to by KEY.  Both macros clobber TMP[0-2].
+.macro	_ghash_update_begin_8x	enc
+
+	// Initialize the inner block counter.
+	xor		%eax, %eax
+
+	// Load the highest hash key power, H^8.
+	movdqa		OFFSETOF_H_POWERS(KEY), TMP0
+
+	// Load the first ciphertext block and byte-reflect it.
+.if \enc
+	movdqu		(DST), TMP1
+.else
+	movdqu		(SRC), TMP1
+.endif
+	pshufb		BSWAP_MASK, TMP1
+
+	// Add the GHASH accumulator to the ciphertext block to get the block
+	// 'b' that needs to be multiplied with the hash key power 'a'.
+	pxor		TMP1, GHASH_ACC
+
+	// b_L + b_H
+	pshufd		$0x4e, GHASH_ACC, MI
+	pxor		GHASH_ACC, MI
+
+	// LO = a_L * b_L
+	_vpclmulqdq	$0x00, TMP0, GHASH_ACC, LO
+
+	// HI = a_H * b_H
+	pclmulqdq	$0x11, TMP0, GHASH_ACC
+
+	// MI = (a_L + a_H) * (b_L + b_H)
+	pclmulqdq	$0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
+.endm
+
+// Continue the GHASH update of 8 ciphertext blocks as described above by doing
+// an unreduced multiplication of the next ciphertext block by the next lowest
+// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
+.macro	_ghash_update_continue_8x enc
+	add		$8, %eax
+
+	// Load the next lowest key power.
+	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), TMP0
+
+	// Load the next ciphertext block and byte-reflect it.
+.if \enc
+	movdqu		(DST,%rax,2), TMP1
+.else
+	movdqu		(SRC,%rax,2), TMP1
+.endif
+	pshufb		BSWAP_MASK, TMP1
+
+	// LO += a_L * b_L
+	_vpclmulqdq	$0x00, TMP0, TMP1, TMP2
+	pxor		TMP2, LO
+
+	// b_L + b_H
+	pshufd		$0x4e, TMP1, TMP2
+	pxor		TMP1, TMP2
+
+	// HI += a_H * b_H
+	pclmulqdq	$0x11, TMP0, TMP1
+	pxor		TMP1, GHASH_ACC
+
+	// MI += (a_L + a_H) * (b_L + b_H)
+	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
+	pclmulqdq	$0x00, TMP1, TMP2
+	pxor		TMP2, MI
+.endm
+
+// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC.  This is similar to
+// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
+// it uses the same register for HI and the destination.  It's also divided into
+// two steps.  TMP1 must be preserved across steps.
+//
+// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
+// shuffling LO, XOR'ing LO into MI, and shuffling MI.  However, this would
+// increase the critical path length, and it seems to slightly hurt performance.
+.macro	_ghash_update_end_8x_step	i
+.if \i == 0
+	movq		.Lgfpoly(%rip), TMP1
+	pxor		LO, MI
+	pxor		GHASH_ACC, MI
+	pshufd		$0x4e, LO, TMP2
+	pclmulqdq	$0x00, TMP1, LO
+	pxor		TMP2, MI
+	pxor		LO, MI
+.elseif \i == 1
+	pshufd		$0x4e, MI, TMP2
+	pclmulqdq	$0x00, TMP1, MI
+	pxor		TMP2, GHASH_ACC
+	pxor		MI, GHASH_ACC
+.endif
+.endm
+
+// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
+//
+// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH
+// related fields in the key struct.
+.macro	_aes_gcm_precompute
+
+	// Function arguments
+	.set	KEY,		%rdi
+
+	// Additional local variables.
+	// %xmm0-%xmm1 and %rax are used as temporaries.
+	.set	RNDKEYLAST_PTR,	%rsi
+	.set	H_CUR,		%xmm2
+	.set	H_POW1,		%xmm3	// H^1
+	.set	H_POW1_X64,	%xmm4	// H^1 * x^64
+	.set	GFPOLY,		%xmm5
+
+	// Encrypt an all-zeroes block to get the raw hash subkey.
+	movl		OFFSETOF_AESKEYLEN(KEY), %eax
+	lea		6*16(KEY,%rax,4), RNDKEYLAST_PTR
+	movdqa		(KEY), H_POW1  // Zero-th round key XOR all-zeroes block
+	lea		16(KEY), %rax
+1:
+	aesenc		(%rax), H_POW1
+	add		$16, %rax
+	cmp		%rax, RNDKEYLAST_PTR
+	jne		1b
+	aesenclast	(RNDKEYLAST_PTR), H_POW1
+
+	// Preprocess the raw hash subkey as needed to operate on GHASH's
+	// bit-reflected values directly: reflect its bytes, then multiply it by
+	// x^-1 (using the backwards interpretation of polynomial coefficients
+	// from the GCM spec) or equivalently x^1 (using the alternative,
+	// natural interpretation of polynomial coefficients).
+	pshufb		.Lbswap_mask(%rip), H_POW1
+	movdqa		H_POW1, %xmm0
+	pshufd		$0xd3, %xmm0, %xmm0
+	psrad		$31, %xmm0
+	paddq		H_POW1, H_POW1
+	pand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0
+	pxor		%xmm0, H_POW1
+
+	// Store H^1.
+	movdqa		H_POW1, OFFSETOF_H_POWERS+7*16(KEY)
+
+	// Compute and store H^1 * x^64.
+	movq		.Lgfpoly(%rip), GFPOLY
+	pshufd		$0x4e, H_POW1, %xmm0
+	_vpclmulqdq	$0x00, H_POW1, GFPOLY, H_POW1_X64
+	pxor		%xmm0, H_POW1_X64
+	movdqa		H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)
+
+	// Compute and store the halves of H^1 XOR'd together.
+	pxor		H_POW1, %xmm0
+	movq		%xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)
+
+	// Compute and store the remaining key powers H^2 through H^8.
+	movdqa		H_POW1, H_CUR
+	mov		$6*8, %eax
+.Lprecompute_next\@:
+	// Compute H^i = H^{i-1} * H^1.
+	_ghash_mul	H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1
+	// Store H^i.
+	movdqa		H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
+	// Compute and store the halves of H^i XOR'd together.
+	pshufd		$0x4e, H_CUR, %xmm0
+	pxor		H_CUR, %xmm0
+	movq		%xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
+	sub		$8, %eax
+	jge		.Lprecompute_next\@
+
+	RET
+.endm
+
+// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
+//				 u8 ghash_acc[16], const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|.  On the first call, |ghash_acc| must be all
+// zeroes.  |aadlen| must be a multiple of 16, except on the last call where it
+// can be any length.  The caller must do any buffering needed to ensure this.
+.macro	_aes_gcm_aad_update
+
+	// Function arguments
+	.set	KEY,		%rdi
+	.set	GHASH_ACC_PTR,	%rsi
+	.set	AAD,		%rdx
+	.set	AADLEN,		%ecx
+	// Note: _load_partial_block relies on AADLEN being in %ecx.
+
+	// Additional local variables.
+	// %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
+	.set	BSWAP_MASK,	%xmm2
+	.set	GHASH_ACC,	%xmm3
+	.set	H_POW1,		%xmm4	// H^1
+	.set	H_POW1_X64,	%xmm5	// H^1 * x^64
+	.set	GFPOLY,		%xmm6
+
+	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
+	movdqu		(GHASH_ACC_PTR), GHASH_ACC
+	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
+	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
+	movq		.Lgfpoly(%rip), GFPOLY
+
+	// Process the AAD one full block at a time.
+	sub		$16, AADLEN
+	jl		.Laad_loop_1x_done\@
+.Laad_loop_1x\@:
+	movdqu		(AAD), %xmm0
+	pshufb		BSWAP_MASK, %xmm0
+	pxor		%xmm0, GHASH_ACC
+	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
+	add		$16, AAD
+	sub		$16, AADLEN
+	jge		.Laad_loop_1x\@
+.Laad_loop_1x_done\@:
+	// Check whether there is a partial block at the end.
+	add		$16, AADLEN
+	jz		.Laad_done\@
+
+	// Process a partial block of length 1 <= AADLEN <= 15.
+	// _load_partial_block assumes that %ecx contains AADLEN.
+	_load_partial_block	AAD, %xmm0, %r10, %r10d
+	pshufb		BSWAP_MASK, %xmm0
+	pxor		%xmm0, GHASH_ACC
+	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
+
+.Laad_done\@:
+	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
+	RET
+.endm
+
+// Increment LE_CTR eight times to generate eight little-endian counter blocks,
+// swap each to big-endian, and store them in AESDATA[0-7].  Also XOR them with
+// the zero-th AES round key.  Clobbers TMP0 and TMP1.
+.macro	_ctr_begin_8x
+	movq		.Lone(%rip), TMP0
+	movdqa		(KEY), TMP1		// zero-th round key
+.irp i, 0,1,2,3,4,5,6,7
+	_vpshufb	BSWAP_MASK, LE_CTR, AESDATA\i
+	pxor		TMP1, AESDATA\i
+	paddd		TMP0, LE_CTR
+.endr
+.endm
+
+// Do a non-last round of AES on AESDATA[0-7] using \round_key.
+.macro	_aesenc_8x	round_key
+.irp i, 0,1,2,3,4,5,6,7
+	aesenc		\round_key, AESDATA\i
+.endr
+.endm
+
+// Do the last round of AES on AESDATA[0-7] using \round_key.
+.macro	_aesenclast_8x	round_key
+.irp i, 0,1,2,3,4,5,6,7
+	aesenclast	\round_key, AESDATA\i
+.endr
+.endm
+
+// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
+// store the result to DST.  Clobbers TMP0.
+.macro	_xor_data_8x
+.irp i, 0,1,2,3,4,5,6,7
+	_xor_mem_to_reg	\i*16(SRC), AESDATA\i, tmp=TMP0
+.endr
+.irp i, 0,1,2,3,4,5,6,7
+	movdqu		AESDATA\i, \i*16(DST)
+.endr
+.endm
+
+// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
+//					  const u32 le_ctr[4], u8 ghash_acc[16],
+//					  const u8 *src, u8 *dst, int datalen);
+//
+// This macro generates a GCM encryption or decryption update function with the
+// above prototype (with \enc selecting which one).
+//
+// This function computes the next portion of the CTR keystream, XOR's it with
+// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
+// data to |dst|.  It also updates the GHASH accumulator |ghash_acc| using the
+// next |datalen| ciphertext bytes.
+//
+// |datalen| must be a multiple of 16, except on the last call where it can be
+// any length.  The caller must do any buffering needed to ensure this.  Both
+// in-place and out-of-place en/decryption are supported.
+//
+// |le_ctr| must give the current counter in little-endian format.  For a new
+// message, the low word of the counter must be 2.  This function loads the
+// counter from |le_ctr| and increments the loaded counter as needed, but it
+// does *not* store the updated counter back to |le_ctr|.  The caller must
+// update |le_ctr| if any more data segments follow.  Internally, only the low
+// 32-bit word of the counter is incremented, following the GCM standard.
+.macro	_aes_gcm_update	enc
+
+	// Function arguments
+	.set	KEY,		%rdi
+	.set	LE_CTR_PTR,	%rsi	// Note: overlaps with usage as temp reg
+	.set	GHASH_ACC_PTR,	%rdx
+	.set	SRC,		%rcx
+	.set	DST,		%r8
+	.set	DATALEN,	%r9d
+	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!
+	// Note: the code setting up for _load_partial_block assumes that SRC is
+	// in %rcx (and that DATALEN is *not* in %rcx).
+
+	// Additional local variables
+
+	// %rax and %rsi are used as temporary registers.  Note: %rsi overlaps
+	// with LE_CTR_PTR, which is used only at the beginning.
+
+	.set	AESKEYLEN,	%r10d	// AES key length in bytes
+	.set	AESKEYLEN64,	%r10
+	.set	RNDKEYLAST_PTR,	%r11	// Pointer to last AES round key
+
+	// Put the most frequently used values in %xmm0-%xmm7 to reduce code
+	// size.  (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
+	.set	TMP0,		%xmm0
+	.set	TMP1,		%xmm1
+	.set	TMP2,		%xmm2
+	.set	LO,		%xmm3	// Low part of unreduced product
+	.set	MI,		%xmm4	// Middle part of unreduced product
+	.set	GHASH_ACC,	%xmm5	// GHASH accumulator; in main loop also
+					// the high part of unreduced product
+	.set	BSWAP_MASK,	%xmm6	// Shuffle mask for reflecting bytes
+	.set	LE_CTR,		%xmm7	// Little-endian counter value
+	.set	AESDATA0,	%xmm8
+	.set	AESDATA1,	%xmm9
+	.set	AESDATA2,	%xmm10
+	.set	AESDATA3,	%xmm11
+	.set	AESDATA4,	%xmm12
+	.set	AESDATA5,	%xmm13
+	.set	AESDATA6,	%xmm14
+	.set	AESDATA7,	%xmm15
+
+	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
+	movdqu		(GHASH_ACC_PTR), GHASH_ACC
+	movdqu		(LE_CTR_PTR), LE_CTR
+
+	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+	lea		6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
+
+	// If there are at least 8*16 bytes of data, then continue into the main
+	// loop, which processes 8*16 bytes of data per iteration.
+	//
+	// The main loop interleaves AES and GHASH to improve performance on
+	// CPUs that can execute these instructions in parallel.  When
+	// decrypting, the GHASH input (the ciphertext) is immediately
+	// available.  When encrypting, we instead encrypt a set of 8 blocks
+	// first and then GHASH those blocks while encrypting the next set of 8,
+	// repeat that as needed, and finally GHASH the last set of 8 blocks.
+	//
+	// Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
+	// as this makes the immediate fit in a signed byte, saving 3 bytes.
+	add		$-8*16, DATALEN
+	jl		.Lcrypt_loop_8x_done\@
+.if \enc
+	// Encrypt the first 8 plaintext blocks.
+	_ctr_begin_8x
+	lea		16(KEY), %rsi
+	.p2align 4
+1:
+	movdqa		(%rsi), TMP0
+	_aesenc_8x	TMP0
+	add		$16, %rsi
+	cmp		%rsi, RNDKEYLAST_PTR
+	jne		1b
+	movdqa		(%rsi), TMP0
+	_aesenclast_8x	TMP0
+	_xor_data_8x
+	// Don't increment DST until the ciphertext blocks have been hashed.
+	sub		$-8*16, SRC
+	add		$-8*16, DATALEN
+	jl		.Lghash_last_ciphertext_8x\@
+.endif
+
+	.p2align 4
+.Lcrypt_loop_8x\@:
+
+	// Generate the next set of 8 counter blocks and start encrypting them.
+	_ctr_begin_8x
+	lea		16(KEY), %rsi
+
+	// Do a round of AES, and start the GHASH update of 8 ciphertext blocks
+	// by doing the unreduced multiplication for the first ciphertext block.
+	movdqa		(%rsi), TMP0
+	add		$16, %rsi
+	_aesenc_8x	TMP0
+	_ghash_update_begin_8x \enc
+
+	// Do 7 more rounds of AES, and continue the GHASH update by doing the
+	// unreduced multiplication for the remaining ciphertext blocks.
+	.p2align 4
+1:
+	movdqa		(%rsi), TMP0
+	add		$16, %rsi
+	_aesenc_8x	TMP0
+	_ghash_update_continue_8x \enc
+	cmp		$7*8, %eax
+	jne		1b
+
+	// Do the remaining AES rounds.
+	.p2align 4
+1:
+	movdqa		(%rsi), TMP0
+	add		$16, %rsi
+	_aesenc_8x	TMP0
+	cmp		%rsi, RNDKEYLAST_PTR
+	jne		1b
+
+	// Do the GHASH reduction and the last round of AES.
+	movdqa		(RNDKEYLAST_PTR), TMP0
+	_ghash_update_end_8x_step	0
+	_aesenclast_8x	TMP0
+	_ghash_update_end_8x_step	1
+
+	// XOR the data with the AES-CTR keystream blocks.
+.if \enc
+	sub		$-8*16, DST
+.endif
+	_xor_data_8x
+	sub		$-8*16, SRC
+.if !\enc
+	sub		$-8*16, DST
+.endif
+	add		$-8*16, DATALEN
+	jge		.Lcrypt_loop_8x\@
+
+.if \enc
+.Lghash_last_ciphertext_8x\@:
+	// Update GHASH with the last set of 8 ciphertext blocks.
+	_ghash_update_begin_8x		\enc
+	.p2align 4
+1:
+	_ghash_update_continue_8x	\enc
+	cmp		$7*8, %eax
+	jne		1b
+	_ghash_update_end_8x_step	0
+	_ghash_update_end_8x_step	1
+	sub		$-8*16, DST
+.endif
+
+.Lcrypt_loop_8x_done\@:
+
+	sub		$-8*16, DATALEN
+	jz		.Ldone\@
+
+	// Handle the remainder of length 1 <= DATALEN < 8*16 bytes.  We keep
+	// things simple and keep the code size down by just going one block at
+	// a time, again taking advantage of hardware loop unrolling.  Since
+	// there are enough key powers available for all remaining data, we do
+	// the GHASH multiplications unreduced, and only reduce at the very end.
+
+	.set	HI,		TMP2
+	.set	H_POW,		AESDATA0
+	.set	H_POW_XORED,	AESDATA1
+	.set	ONE,		AESDATA2
+
+	movq		.Lone(%rip), ONE
+
+	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+	pxor		LO, LO
+	pxor		MI, MI
+	pxor		HI, HI
+
+	// Set up a block counter %rax to contain 8*(8-n), where n is the number
+	// of blocks that remain, counting any partial block.  This will be used
+	// to access the key powers H^n through H^1.
+	mov		DATALEN, %eax
+	neg		%eax
+	and		$~15, %eax
+	sar		$1, %eax
+	add		$64, %eax
+
+	sub		$16, DATALEN
+	jl		.Lcrypt_loop_1x_done\@
+
+	// Process the data one full block at a time.
+.Lcrypt_loop_1x\@:
+
+	// Encrypt the next counter block.
+	_vpshufb	BSWAP_MASK, LE_CTR, TMP0
+	paddd		ONE, LE_CTR
+	pxor		(KEY), TMP0
+	lea		-6*16(RNDKEYLAST_PTR), %rsi	// Reduce code size
+	cmp		$24, AESKEYLEN
+	jl		128f	// AES-128?
+	je		192f	// AES-192?
+	// AES-256
+	aesenc		-7*16(%rsi), TMP0
+	aesenc		-6*16(%rsi), TMP0
+192:
+	aesenc		-5*16(%rsi), TMP0
+	aesenc		-4*16(%rsi), TMP0
+128:
+.irp i, -3,-2,-1,0,1,2,3,4,5
+	aesenc		\i*16(%rsi), TMP0
+.endr
+	aesenclast	(RNDKEYLAST_PTR), TMP0
+
+	// Load the next key power H^i.
+	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
+	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
+
+	// XOR the keystream block that was just generated in TMP0 with the next
+	// source data block and store the resulting en/decrypted data to DST.
+.if \enc
+	_xor_mem_to_reg	(SRC), TMP0, tmp=TMP1
+	movdqu		TMP0, (DST)
+.else
+	movdqu		(SRC), TMP1
+	pxor		TMP1, TMP0
+	movdqu		TMP0, (DST)
+.endif
+
+	// Update GHASH with the ciphertext block.
+.if \enc
+	pshufb		BSWAP_MASK, TMP0
+	pxor		TMP0, GHASH_ACC
+.else
+	pshufb		BSWAP_MASK, TMP1
+	pxor		TMP1, GHASH_ACC
+.endif
+	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
+	pxor		GHASH_ACC, GHASH_ACC
+
+	add		$8, %eax
+	add		$16, SRC
+	add		$16, DST
+	sub		$16, DATALEN
+	jge		.Lcrypt_loop_1x\@
+.Lcrypt_loop_1x_done\@:
+	// Check whether there is a partial block at the end.
+	add		$16, DATALEN
+	jz		.Lghash_reduce\@
+
+	// Process a partial block of length 1 <= DATALEN <= 15.
+
+	// Encrypt a counter block for the last time.
+	pshufb		BSWAP_MASK, LE_CTR
+	pxor		(KEY), LE_CTR
+	lea		16(KEY), %rsi
+1:
+	aesenc		(%rsi), LE_CTR
+	add		$16, %rsi
+	cmp		%rsi, RNDKEYLAST_PTR
+	jne		1b
+	aesenclast	(RNDKEYLAST_PTR), LE_CTR
+
+	// Load the lowest key power, H^1.
+	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
+	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED
+
+	// Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC.  SRC is
+	// in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
+	// RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
+	mov		SRC, RNDKEYLAST_PTR
+	mov		DATALEN, %ecx
+	_load_partial_block	RNDKEYLAST_PTR, TMP0, %rsi, %esi
+
+	// XOR the keystream block that was just generated in LE_CTR with the
+	// source data block and store the resulting en/decrypted data to DST.
+	pxor		TMP0, LE_CTR
+	mov		DATALEN, %ecx
+	_store_partial_block	LE_CTR, DST
+
+	// If encrypting, zero-pad the final ciphertext block for GHASH.  (If
+	// decrypting, this was already done by _load_partial_block.)
+.if \enc
+	lea		.Lzeropad_mask+16(%rip), %rax
+	sub		DATALEN64, %rax
+	_vpand		(%rax), LE_CTR, TMP0
+.endif
+
+	// Update GHASH with the final ciphertext block.
+	pshufb		BSWAP_MASK, TMP0
+	pxor		TMP0, GHASH_ACC
+	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
+
+.Lghash_reduce\@:
+	// Finally, do the GHASH reduction.
+	_ghash_reduce	LO, MI, HI, GHASH_ACC, TMP0
+
+.Ldone\@:
+	// Store the updated GHASH accumulator back to memory.
+	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
+
+	RET
+.endm
+
+// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
+//				   const u32 le_ctr[4], u8 ghash_acc[16],
+//				   u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
+//				   const u32 le_ctr[4], const u8 ghash_acc[16],
+//				   u64 total_aadlen, u64 total_datalen,
+//				   const u8 tag[16], int taglen);
+//
+// This macro generates one of the above two functions (with \enc selecting
+// which one).  Both functions finish computing the GCM authentication tag by
+// updating GHASH with the lengths block and encrypting the GHASH accumulator.
+// |total_aadlen| and |total_datalen| must be the total length of the additional
+// authenticated data and the en/decrypted data in bytes, respectively.
+//
+// The encryption function then stores the full-length (16-byte) computed
+// authentication tag to |ghash_acc|.  The decryption function instead loads the
+// expected authentication tag (the one that was transmitted) from the 16-byte
+// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
+// computed tag in constant time, and returns true if and only if they match.
+.macro	_aes_gcm_final	enc
+
+	// Function arguments
+	.set	KEY,		%rdi
+	.set	LE_CTR_PTR,	%rsi
+	.set	GHASH_ACC_PTR,	%rdx
+	.set	TOTAL_AADLEN,	%rcx
+	.set	TOTAL_DATALEN,	%r8
+	.set	TAG,		%r9
+	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)
+	.set	TAGLEN64,	%r10
+
+	// Additional local variables.
+	// %rax and %xmm0-%xmm2 are used as temporary registers.
+	.set	AESKEYLEN,	%r11d
+	.set	AESKEYLEN64,	%r11
+	.set	BSWAP_MASK,	%xmm3
+	.set	GHASH_ACC,	%xmm4
+	.set	H_POW1,		%xmm5	// H^1
+	.set	H_POW1_X64,	%xmm6	// H^1 * x^64
+	.set	GFPOLY,		%xmm7
+
+	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
+	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+	// Set up a counter block with 1 in the low 32-bit word.  This is the
+	// counter that produces the ciphertext needed to encrypt the auth tag.
+	movdqu		(LE_CTR_PTR), %xmm0
+	mov		$1, %eax
+	pinsrd		$0, %eax, %xmm0
+
+	// Build the lengths block and XOR it into the GHASH accumulator.
+	movq		TOTAL_DATALEN, GHASH_ACC
+	pinsrq		$1, TOTAL_AADLEN, GHASH_ACC
+	psllq		$3, GHASH_ACC	// Bytes to bits
+	_xor_mem_to_reg	(GHASH_ACC_PTR), GHASH_ACC, %xmm1
+
+	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
+	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
+	movq		.Lgfpoly(%rip), GFPOLY
+
+	// Make %rax point to the 6th from last AES round key.  (Using signed
+	// byte offsets -7*16 through 6*16 decreases code size.)
+	lea		(KEY,AESKEYLEN64,4), %rax
+
+	// AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
+	// Interleave the AES and GHASH instructions to improve performance.
+	pshufb		BSWAP_MASK, %xmm0
+	pxor		(KEY), %xmm0
+	cmp		$24, AESKEYLEN
+	jl		128f	// AES-128?
+	je		192f	// AES-192?
+	// AES-256
+	aesenc		-7*16(%rax), %xmm0
+	aesenc		-6*16(%rax), %xmm0
+192:
+	aesenc		-5*16(%rax), %xmm0
+	aesenc		-4*16(%rax), %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+	aesenc		(\i-3)*16(%rax), %xmm0
+	_ghash_mul_step	\i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
+.endr
+	aesenclast	6*16(%rax), %xmm0
+	_ghash_mul_step	9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
+
+	// Undo the byte reflection of the GHASH accumulator.
+	pshufb		BSWAP_MASK, GHASH_ACC
+
+	// Encrypt the GHASH accumulator.
+	pxor		%xmm0, GHASH_ACC
+
+.if \enc
+	// Return the computed auth tag.
+	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
+.else
+	.set		ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN!
+
+	// Verify the auth tag in constant time by XOR'ing the transmitted and
+	// computed auth tags together and using the ptest instruction to check
+	// whether the first TAGLEN bytes of the result are zero.
+	_xor_mem_to_reg	(TAG), GHASH_ACC, tmp=%xmm0
+	movl		8(%rsp), TAGLEN
+	lea		.Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
+	sub		TAGLEN64, ZEROPAD_MASK_PTR
+	xor		%eax, %eax
+	_test_mem	(ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
+	sete		%al
+.endif
+	RET
+.endm
+
+.set	USE_AVX, 0
+SYM_FUNC_START(aes_gcm_precompute_aesni)
+	_aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_aesni)
+SYM_FUNC_START(aes_gcm_aad_update_aesni)
+	_aes_gcm_aad_update
+SYM_FUNC_END(aes_gcm_aad_update_aesni)
+SYM_FUNC_START(aes_gcm_enc_update_aesni)
+	_aes_gcm_update	1
+SYM_FUNC_END(aes_gcm_enc_update_aesni)
+SYM_FUNC_START(aes_gcm_dec_update_aesni)
+	_aes_gcm_update	0
+SYM_FUNC_END(aes_gcm_dec_update_aesni)
+SYM_FUNC_START(aes_gcm_enc_final_aesni)
+	_aes_gcm_final	1
+SYM_FUNC_END(aes_gcm_enc_final_aesni)
+SYM_FUNC_START(aes_gcm_dec_final_aesni)
+	_aes_gcm_final	0
+SYM_FUNC_END(aes_gcm_dec_final_aesni)
+
+.set	USE_AVX, 1
+SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
+	_aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
+SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
+	_aes_gcm_aad_update
+SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
+	_aes_gcm_update	1
+SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
+	_aes_gcm_update	0
+SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
+SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
+	_aes_gcm_final	1
+SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
+SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
+	_aes_gcm_final	0
+SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)
diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
new file mode 100644
index 000000000000..97e0ee515fc5
--- /dev/null
+++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S
@@ -0,0 +1,1222 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// VAES and VPCLMULQDQ optimized AES-GCM for x86_64
+//
+// Copyright 2024 Google LLC
+//
+// Author: Eric Biggers <ebiggers@google.com>
+//
+//------------------------------------------------------------------------------
+//
+// This file is dual-licensed, meaning that you can use it under your choice of
+// either of the following two licenses:
+//
+// Licensed under the Apache License 2.0 (the "License").  You may obtain a copy
+// of the License at
+//
+//	http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// or
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice,
+//    this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+//------------------------------------------------------------------------------
+//
+// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
+// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and
+// either AVX512 or AVX10.  Some of the functions, notably the encryption and
+// decryption update functions which are the most performance-critical, are
+// provided in two variants generated from a macro: one using 256-bit vectors
+// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512).  The
+// other, "shared" functions (vaes_avx10) use at most 256-bit vectors.
+//
+// The functions that use 512-bit vectors are intended for CPUs that support
+// 512-bit vectors *and* where using them doesn't cause significant
+// downclocking.  They require the following CPU features:
+//
+//	VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512)
+//
+// The other functions require the following CPU features:
+//
+//	VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256)
+//
+// All functions use the "System V" ABI.  The Windows ABI is not supported.
+//
+// Note that we use "avx10" in the names of the functions as a shorthand to
+// really mean "AVX10 or a certain set of AVX512 features".  Due to Intel's
+// introduction of AVX512 and then its replacement by AVX10, there doesn't seem
+// to be a simple way to name things that makes sense on all CPUs.
+//
+// Note that the macros that support both 256-bit and 512-bit vectors could
+// fairly easily be changed to support 128-bit too.  However, this would *not*
+// be sufficient to allow the code to run on CPUs without AVX512 or AVX10,
+// because the code heavily uses several features of these extensions other than
+// the vector length: the increase in the number of SIMD registers from 16 to
+// 32, masking support, and new instructions such as vpternlogd (which can do a
+// three-argument XOR).  These features are very useful for AES-GCM.
+
+#include <linux/linkage.h>
+
+.section .rodata
+.p2align 6
+
+	// A shuffle mask that reflects the bytes of 16-byte blocks
+.Lbswap_mask:
+	.octa   0x000102030405060708090a0b0c0d0e0f
+
+	// This is the GHASH reducing polynomial without its constant term, i.e.
+	// x^128 + x^7 + x^2 + x, represented using the backwards mapping
+	// between bits and polynomial coefficients.
+	//
+	// Alternatively, it can be interpreted as the naturally-ordered
+	// representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the
+	// "reversed" GHASH reducing polynomial without its x^128 term.
+.Lgfpoly:
+	.octa	0xc2000000000000000000000000000001
+
+	// Same as above, but with the (1 << 64) bit set.
+.Lgfpoly_and_internal_carrybit:
+	.octa	0xc2000000000000010000000000000001
+
+	// The below constants are used for incrementing the counter blocks.
+	// ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
+	// inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
+	// 4.  Note that the same '2' is reused in ctr_pattern and inc_2blocks.
+.Lctr_pattern:
+	.octa	0
+	.octa	1
+.Linc_2blocks:
+	.octa	2
+	.octa	3
+.Linc_4blocks:
+	.octa	4
+
+// Number of powers of the hash key stored in the key struct.  The powers are
+// stored from highest (H^NUM_H_POWERS) to lowest (H^1).
+#define NUM_H_POWERS		16
+
+// Offset to AES key length (in bytes) in the key struct
+#define OFFSETOF_AESKEYLEN	480
+
+// Offset to start of hash key powers array in the key struct
+#define OFFSETOF_H_POWERS	512
+
+// Offset to end of hash key powers array in the key struct.
+//
+// This is immediately followed by three zeroized padding blocks, which are
+// included so that partial vectors can be handled more easily.  E.g. if VL=64
+// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0].  The most
+// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded.
+#define OFFSETOFEND_H_POWERS	(OFFSETOF_H_POWERS + (NUM_H_POWERS * 16))
+
+.text
+
+// Set the vector length in bytes.  This sets the VL variable and defines
+// register aliases V0-V31 that map to the ymm or zmm registers.
+.macro	_set_veclen	vl
+	.set	VL,	\vl
+.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
+	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+.if VL == 32
+	.set	V\i,	%ymm\i
+.elseif VL == 64
+	.set	V\i,	%zmm\i
+.else
+	.error "Unsupported vector length"
+.endif
+.endr
+.endm
+
+// The _ghash_mul_step macro does one step of GHASH multiplication of the
+// 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the
+// reduced products in \dst.  \t0, \t1, and \t2 are temporary registers of the
+// same size as \a and \b.  To complete all steps, this must invoked with \i=0
+// through \i=9.  The division into steps allows users of this macro to
+// optionally interleave the computation with other instructions.  Users of this
+// macro must preserve the parameter registers across steps.
+//
+// The multiplications are done in GHASH's representation of the finite field
+// GF(2^128).  Elements of GF(2^128) are represented as binary polynomials
+// (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial
+// G.  The GCM specification uses G = x^128 + x^7 + x^2 + x + 1.  Addition is
+// just XOR, while multiplication is more complex and has two parts: (a) do
+// carryless multiplication of two 128-bit input polynomials to get a 256-bit
+// intermediate product polynomial, and (b) reduce the intermediate product to
+// 128 bits by adding multiples of G that cancel out terms in it.  (Adding
+// multiples of G doesn't change which field element the polynomial represents.)
+//
+// Unfortunately, the GCM specification maps bits to/from polynomial
+// coefficients backwards from the natural order.  In each byte it specifies the
+// highest bit to be the lowest order polynomial coefficient, *not* the highest!
+// This makes it nontrivial to work with the GHASH polynomials.  We could
+// reflect the bits, but x86 doesn't have an instruction that does that.
+//
+// Instead, we operate on the values without bit-reflecting them.  This *mostly*
+// just works, since XOR and carryless multiplication are symmetric with respect
+// to bit order, but it has some consequences.  First, due to GHASH's byte
+// order, by skipping bit reflection, *byte* reflection becomes necessary to
+// give the polynomial terms a consistent order.  E.g., considering an N-bit
+// value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0
+// through N-1 of the byte-reflected value represent the coefficients of x^(N-1)
+// through x^0, whereas bits 0 through N-1 of the non-byte-reflected value
+// represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked
+// with.  Fortunately, x86's vpshufb instruction can do byte reflection.
+//
+// Second, forgoing the bit reflection causes an extra multiple of x (still
+// using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each
+// multiplication.  This is because an M-bit by N-bit carryless multiplication
+// really produces a (M+N-1)-bit product, but in practice it's zero-extended to
+// M+N bits.  In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits
+// to polynomial coefficients backwards, this zero-extension actually changes
+// the product by introducing an extra factor of x.  Therefore, users of this
+// macro must ensure that one of the inputs has an extra factor of x^-1, i.e.
+// the multiplicative inverse of x, to cancel out the extra x.
+//
+// Third, the backwards coefficients convention is just confusing to work with,
+// since it makes "low" and "high" in the polynomial math mean the opposite of
+// their normal meaning in computer programming.  This can be solved by using an
+// alternative interpretation: the polynomial coefficients are understood to be
+// in the natural order, and the multiplication is actually \a * \b * x^-128 mod
+// x^128 + x^127 + x^126 + x^121 + 1.  This doesn't change the inputs, outputs,
+// or the implementation at all; it just changes the mathematical interpretation
+// of what each instruction is doing.  Starting from here, we'll use this
+// alternative interpretation, as it's easier to understand the code that way.
+//
+// Moving onto the implementation, the vpclmulqdq instruction does 64 x 64 =>
+// 128-bit carryless multiplication, so we break the 128 x 128 multiplication
+// into parts as follows (the _L and _H suffixes denote low and high 64 bits):
+//
+//     LO = a_L * b_L
+//     MI = (a_L * b_H) + (a_H * b_L)
+//     HI = a_H * b_H
+//
+// The 256-bit product is x^128*HI + x^64*MI + LO.  LO, MI, and HI are 128-bit.
+// Note that MI "overlaps" with LO and HI.  We don't consolidate MI into LO and
+// HI right away, since the way the reduction works makes that unnecessary.
+//
+// For the reduction, we cancel out the low 128 bits by adding multiples of G =
+// x^128 + x^127 + x^126 + x^121 + 1.  This is done by two iterations, each of
+// which cancels out the next lowest 64 bits.  Consider a value x^64*A + B,
+// where A and B are 128-bit.  Adding B_L*G to that value gives:
+//
+//       x^64*A + B + B_L*G
+//     = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1)
+//     = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L
+//     = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L
+//     = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57))
+//
+// So: if we sum A, B with its halves swapped, and the low half of B times x^63
+// + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the
+// original value x^64*A + B.  I.e., the low 64 bits got canceled out.
+//
+// We just need to apply this twice: first to fold LO into MI, and second to
+// fold the updated MI into HI.
+//
+// The needed three-argument XORs are done using the vpternlogd instruction with
+// immediate 0x96, since this is faster than two vpxord instructions.
+//
+// A potential optimization, assuming that b is fixed per-key (if a is fixed
+// per-key it would work the other way around), is to use one iteration of the
+// reduction described above to precompute a value c such that x^64*c = b mod G,
+// and then multiply a_L by c (and implicitly by x^64) instead of by b:
+//
+//     MI = (a_L * c_L) + (a_H * b_L)
+//     HI = (a_L * c_H) + (a_H * b_H)
+//
+// This would eliminate the LO part of the intermediate product, which would
+// eliminate the need to fold LO into MI.  This would save two instructions,
+// including a vpclmulqdq.  However, we currently don't use this optimization
+// because it would require twice as many per-key precomputed values.
+//
+// Using Karatsuba multiplication instead of "schoolbook" multiplication
+// similarly would save a vpclmulqdq but does not seem to be worth it.
+.macro	_ghash_mul_step	i, a, b, dst, gfpoly, t0, t1, t2
+.if \i == 0
+	vpclmulqdq	$0x00, \a, \b, \t0	  // LO = a_L * b_L
+	vpclmulqdq	$0x01, \a, \b, \t1	  // MI_0 = a_L * b_H
+.elseif \i == 1
+	vpclmulqdq	$0x10, \a, \b, \t2	  // MI_1 = a_H * b_L
+.elseif \i == 2
+	vpxord		\t2, \t1, \t1		  // MI = MI_0 + MI_1
+.elseif \i == 3
+	vpclmulqdq	$0x01, \t0, \gfpoly, \t2  // LO_L*(x^63 + x^62 + x^57)
+.elseif \i == 4
+	vpshufd		$0x4e, \t0, \t0		  // Swap halves of LO
+.elseif \i == 5
+	vpternlogd	$0x96, \t2, \t0, \t1	  // Fold LO into MI
+.elseif \i == 6
+	vpclmulqdq	$0x11, \a, \b, \dst	  // HI = a_H * b_H
+.elseif \i == 7
+	vpclmulqdq	$0x01, \t1, \gfpoly, \t0  // MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+	vpshufd		$0x4e, \t1, \t1		  // Swap halves of MI
+.elseif \i == 9
+	vpternlogd	$0x96, \t0, \t1, \dst	  // Fold MI into HI
+.endif
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
+// the reduced products in \dst.  See _ghash_mul_step for full explanation.
+.macro	_ghash_mul	a, b, dst, gfpoly, t0, t1, t2
+.irp i, 0,1,2,3,4,5,6,7,8,9
+	_ghash_mul_step	\i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2
+.endr
+.endm
+
+// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
+// *unreduced* products to \lo, \mi, and \hi.
+.macro	_ghash_mul_noreduce	a, b, lo, mi, hi, t0, t1, t2, t3
+	vpclmulqdq	$0x00, \a, \b, \t0	// a_L * b_L
+	vpclmulqdq	$0x01, \a, \b, \t1	// a_L * b_H
+	vpclmulqdq	$0x10, \a, \b, \t2	// a_H * b_L
+	vpclmulqdq	$0x11, \a, \b, \t3	// a_H * b_H
+	vpxord		\t0, \lo, \lo
+	vpternlogd	$0x96, \t2, \t1, \mi
+	vpxord		\t3, \hi, \hi
+.endm
+
+// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
+// reduced products in \hi.  See _ghash_mul_step for explanation of reduction.
+.macro	_ghash_reduce	lo, mi, hi, gfpoly, t0
+	vpclmulqdq	$0x01, \lo, \gfpoly, \t0
+	vpshufd		$0x4e, \lo, \lo
+	vpternlogd	$0x96, \t0, \lo, \mi
+	vpclmulqdq	$0x01, \mi, \gfpoly, \t0
+	vpshufd		$0x4e, \mi, \mi
+	vpternlogd	$0x96, \t0, \mi, \hi
+.endm
+
+// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key);
+//
+// Given the expanded AES key |key->aes_key|, this function derives the GHASH
+// subkey and initializes |key->ghash_key_powers| with powers of it.
+//
+// The number of key powers initialized is NUM_H_POWERS, and they are stored in
+// the order H^NUM_H_POWERS to H^1.  The zeroized padding blocks after the key
+// powers themselves are also initialized.
+//
+// This macro supports both VL=32 and VL=64.  _set_veclen must have been invoked
+// with the desired length.  In the VL=32 case, the function computes twice as
+// many key powers than are actually used by the VL=32 GCM update functions.
+// This is done to keep the key format the same regardless of vector length.
+.macro	_aes_gcm_precompute
+
+	// Function arguments
+	.set	KEY,		%rdi
+
+	// Additional local variables.  V0-V2 and %rax are used as temporaries.
+	.set	POWERS_PTR,	%rsi
+	.set	RNDKEYLAST_PTR,	%rdx
+	.set	H_CUR,		V3
+	.set	H_CUR_YMM,	%ymm3
+	.set	H_CUR_XMM,	%xmm3
+	.set	H_INC,		V4
+	.set	H_INC_YMM,	%ymm4
+	.set	H_INC_XMM,	%xmm4
+	.set	GFPOLY,		V5
+	.set	GFPOLY_YMM,	%ymm5
+	.set	GFPOLY_XMM,	%xmm5
+
+	// Get pointer to lowest set of key powers (located at end of array).
+	lea		OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR
+
+	// Encrypt an all-zeroes block to get the raw hash subkey.
+	movl		OFFSETOF_AESKEYLEN(KEY), %eax
+	lea		6*16(KEY,%rax,4), RNDKEYLAST_PTR
+	vmovdqu		(KEY), %xmm0  // Zero-th round key XOR all-zeroes block
+	add		$16, KEY
+1:
+	vaesenc		(KEY), %xmm0, %xmm0
+	add		$16, KEY
+	cmp		KEY, RNDKEYLAST_PTR
+	jne		1b
+	vaesenclast	(RNDKEYLAST_PTR), %xmm0, %xmm0
+
+	// Reflect the bytes of the raw hash subkey.
+	vpshufb		.Lbswap_mask(%rip), %xmm0, H_CUR_XMM
+
+	// Zeroize the padding blocks.
+	vpxor		%xmm0, %xmm0, %xmm0
+	vmovdqu		%ymm0, VL(POWERS_PTR)
+	vmovdqu		%xmm0, VL+2*16(POWERS_PTR)
+
+	// Finish preprocessing the first key power, H^1.  Since this GHASH
+	// implementation operates directly on values with the backwards bit
+	// order specified by the GCM standard, it's necessary to preprocess the
+	// raw key as follows.  First, reflect its bytes.  Second, multiply it
+	// by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards
+	// interpretation of polynomial coefficients), which can also be
+	// interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121
+	// + 1 using the alternative, natural interpretation of polynomial
+	// coefficients.  For details, see the comment above _ghash_mul_step.
+	//
+	// Either way, for the multiplication the concrete operation performed
+	// is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
+	// << 120) | 1 if a 1 bit was carried out.  However, there's no 128-bit
+	// wide shift instruction, so instead double each of the two 64-bit
+	// halves and incorporate the internal carry bit into the value XOR'd.
+	vpshufd		$0xd3, H_CUR_XMM, %xmm0
+	vpsrad		$31, %xmm0, %xmm0
+	vpaddq		H_CUR_XMM, H_CUR_XMM, H_CUR_XMM
+	vpand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0
+	vpxor		%xmm0, H_CUR_XMM, H_CUR_XMM
+
+	// Load the gfpoly constant.
+	vbroadcasti32x4	.Lgfpoly(%rip), GFPOLY
+
+	// Square H^1 to get H^2.
+	//
+	// Note that as with H^1, all higher key powers also need an extra
+	// factor of x^-1 (or x using the natural interpretation).  Nothing
+	// special needs to be done to make this happen, though: H^1 * H^1 would
+	// end up with two factors of x^-1, but the multiplication consumes one.
+	// So the product H^2 ends up with the desired one factor of x^-1.
+	_ghash_mul	H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \
+			%xmm0, %xmm1, %xmm2
+
+	// Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2].
+	vinserti128	$1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM
+	vinserti128	$1, H_INC_XMM, H_INC_YMM, H_INC_YMM
+
+.if VL == 64
+	// Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4].
+	_ghash_mul	H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \
+			%ymm0, %ymm1, %ymm2
+	vinserti64x4	$1, H_CUR_YMM, H_INC, H_CUR
+	vshufi64x2	$0, H_INC, H_INC, H_INC
+.endif
+
+	// Store the lowest set of key powers.
+	vmovdqu8	H_CUR, (POWERS_PTR)
+
+	// Compute and store the remaining key powers.  With VL=32, repeatedly
+	// multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)].
+	// With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by
+	// [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)].
+	mov		$(NUM_H_POWERS*16/VL) - 1, %eax
+.Lprecompute_next\@:
+	sub		$VL, POWERS_PTR
+	_ghash_mul	H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2
+	vmovdqu8	H_CUR, (POWERS_PTR)
+	dec		%eax
+	jnz		.Lprecompute_next\@
+
+	vzeroupper	// This is needed after using ymm or zmm registers.
+	RET
+.endm
+
+// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
+// the result in \dst_xmm.  This implicitly zeroizes the other lanes of dst.
+.macro	_horizontal_xor	src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm
+	vextracti32x4	$1, \src, \t0_xmm
+.if VL == 32
+	vpxord		\t0_xmm, \src_xmm, \dst_xmm
+.elseif VL == 64
+	vextracti32x4	$2, \src, \t1_xmm
+	vextracti32x4	$3, \src, \t2_xmm
+	vpxord		\t0_xmm, \src_xmm, \dst_xmm
+	vpternlogd	$0x96, \t1_xmm, \t2_xmm, \dst_xmm
+.else
+	.error "Unsupported vector length"
+.endif
+.endm
+
+// Do one step of the GHASH update of the data blocks given in the vector
+// registers GHASHDATA[0-3].  \i specifies the step to do, 0 through 9.  The
+// division into steps allows users of this macro to optionally interleave the
+// computation with other instructions.  This macro uses the vector register
+// GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered;
+// H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and
+// GHASHTMP[0-2] as temporaries.  This macro handles the byte-reflection of the
+// data blocks.  The parameter registers must be preserved across steps.
+//
+// The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) +
+// H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the
+// operations are vectorized operations on vectors of 16-byte blocks.  E.g.,
+// with VL=32 there are 2 blocks per vector and the vectorized terms correspond
+// to the following non-vectorized terms:
+//
+//	H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0)
+//	H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3
+//	H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5
+//	H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7
+//
+// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15.
+//
+// More concretely, this code does:
+//   - Do vectorized "schoolbook" multiplications to compute the intermediate
+//     256-bit product of each block and its corresponding hash key power.
+//     There are 4*VL/16 of these intermediate products.
+//   - Sum (XOR) the intermediate 256-bit products across vectors.  This leaves
+//     VL/16 256-bit intermediate values.
+//   - Do a vectorized reduction of these 256-bit intermediate values to
+//     128-bits each.  This leaves VL/16 128-bit intermediate values.
+//   - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
+//
+// See _ghash_mul_step for the full explanation of the operations performed for
+// each individual finite field multiplication and reduction.
+.macro	_ghash_step_4x	i
+.if \i == 0
+	vpshufb		BSWAP_MASK, GHASHDATA0, GHASHDATA0
+	vpxord		GHASH_ACC, GHASHDATA0, GHASHDATA0
+	vpshufb		BSWAP_MASK, GHASHDATA1, GHASHDATA1
+	vpshufb		BSWAP_MASK, GHASHDATA2, GHASHDATA2
+.elseif \i == 1
+	vpshufb		BSWAP_MASK, GHASHDATA3, GHASHDATA3
+	vpclmulqdq	$0x00, H_POW4, GHASHDATA0, GHASH_ACC	// LO_0
+	vpclmulqdq	$0x00, H_POW3, GHASHDATA1, GHASHTMP0	// LO_1
+	vpclmulqdq	$0x00, H_POW2, GHASHDATA2, GHASHTMP1	// LO_2
+.elseif \i == 2
+	vpxord		GHASHTMP0, GHASH_ACC, GHASH_ACC		// sum(LO_{1,0})
+	vpclmulqdq	$0x00, H_POW1, GHASHDATA3, GHASHTMP2	// LO_3
+	vpternlogd	$0x96, GHASHTMP2, GHASHTMP1, GHASH_ACC	// LO = sum(LO_{3,2,1,0})
+	vpclmulqdq	$0x01, H_POW4, GHASHDATA0, GHASHTMP0	// MI_0
+.elseif \i == 3
+	vpclmulqdq	$0x01, H_POW3, GHASHDATA1, GHASHTMP1	// MI_1
+	vpclmulqdq	$0x01, H_POW2, GHASHDATA2, GHASHTMP2	// MI_2
+	vpternlogd	$0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0	// sum(MI_{2,1,0})
+	vpclmulqdq	$0x01, H_POW1, GHASHDATA3, GHASHTMP1	// MI_3
+.elseif \i == 4
+	vpclmulqdq	$0x10, H_POW4, GHASHDATA0, GHASHTMP2	// MI_4
+	vpternlogd	$0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0	// sum(MI_{4,3,2,1,0})
+	vpclmulqdq	$0x10, H_POW3, GHASHDATA1, GHASHTMP1	// MI_5
+	vpclmulqdq	$0x10, H_POW2, GHASHDATA2, GHASHTMP2	// MI_6
+.elseif \i == 5
+	vpternlogd	$0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0	// sum(MI_{6,5,4,3,2,1,0})
+	vpclmulqdq	$0x01, GHASH_ACC, GFPOLY, GHASHTMP2	// LO_L*(x^63 + x^62 + x^57)
+	vpclmulqdq	$0x10, H_POW1, GHASHDATA3, GHASHTMP1	// MI_7
+	vpxord		GHASHTMP1, GHASHTMP0, GHASHTMP0		// MI = sum(MI_{7,6,5,4,3,2,1,0})
+.elseif \i == 6
+	vpshufd		$0x4e, GHASH_ACC, GHASH_ACC		// Swap halves of LO
+	vpclmulqdq	$0x11, H_POW4, GHASHDATA0, GHASHDATA0	// HI_0
+	vpclmulqdq	$0x11, H_POW3, GHASHDATA1, GHASHDATA1	// HI_1
+	vpclmulqdq	$0x11, H_POW2, GHASHDATA2, GHASHDATA2	// HI_2
+.elseif \i == 7
+	vpternlogd	$0x96, GHASHTMP2, GHASH_ACC, GHASHTMP0	// Fold LO into MI
+	vpclmulqdq	$0x11, H_POW1, GHASHDATA3, GHASHDATA3	// HI_3
+	vpternlogd	$0x96, GHASHDATA2, GHASHDATA1, GHASHDATA0 // sum(HI_{2,1,0})
+	vpclmulqdq	$0x01, GHASHTMP0, GFPOLY, GHASHTMP1	// MI_L*(x^63 + x^62 + x^57)
+.elseif \i == 8
+	vpxord		GHASHDATA3, GHASHDATA0, GHASH_ACC	// HI = sum(HI_{3,2,1,0})
+	vpshufd		$0x4e, GHASHTMP0, GHASHTMP0		// Swap halves of MI
+	vpternlogd	$0x96, GHASHTMP1, GHASHTMP0, GHASH_ACC	// Fold MI into HI
+.elseif \i == 9
+	_horizontal_xor	GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \
+			GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM
+.endif
+.endm
+
+// Do one non-last round of AES encryption on the counter blocks in V0-V3 using
+// the round key that has been broadcast to all 128-bit lanes of \round_key.
+.macro	_vaesenc_4x	round_key
+	vaesenc		\round_key, V0, V0
+	vaesenc		\round_key, V1, V1
+	vaesenc		\round_key, V2, V2
+	vaesenc		\round_key, V3, V3
+.endm
+
+// Start the AES encryption of four vectors of counter blocks.
+.macro	_ctr_begin_4x
+
+	// Increment LE_CTR four times to generate four vectors of little-endian
+	// counter blocks, swap each to big-endian, and store them in V0-V3.
+	vpshufb		BSWAP_MASK, LE_CTR, V0
+	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
+	vpshufb		BSWAP_MASK, LE_CTR, V1
+	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
+	vpshufb		BSWAP_MASK, LE_CTR, V2
+	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
+	vpshufb		BSWAP_MASK, LE_CTR, V3
+	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
+
+	// AES "round zero": XOR in the zero-th round key.
+	vpxord		RNDKEY0, V0, V0
+	vpxord		RNDKEY0, V1, V1
+	vpxord		RNDKEY0, V2, V2
+	vpxord		RNDKEY0, V3, V3
+.endm
+
+// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key,
+//					  const u32 le_ctr[4], u8 ghash_acc[16],
+//					  const u8 *src, u8 *dst, int datalen);
+//
+// This macro generates a GCM encryption or decryption update function with the
+// above prototype (with \enc selecting which one).  This macro supports both
+// VL=32 and VL=64.  _set_veclen must have been invoked with the desired length.
+//
+// This function computes the next portion of the CTR keystream, XOR's it with
+// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
+// data to |dst|.  It also updates the GHASH accumulator |ghash_acc| using the
+// next |datalen| ciphertext bytes.
+//
+// |datalen| must be a multiple of 16, except on the last call where it can be
+// any length.  The caller must do any buffering needed to ensure this.  Both
+// in-place and out-of-place en/decryption are supported.
+//
+// |le_ctr| must give the current counter in little-endian format.  For a new
+// message, the low word of the counter must be 2.  This function loads the
+// counter from |le_ctr| and increments the loaded counter as needed, but it
+// does *not* store the updated counter back to |le_ctr|.  The caller must
+// update |le_ctr| if any more data segments follow.  Internally, only the low
+// 32-bit word of the counter is incremented, following the GCM standard.
+.macro	_aes_gcm_update	enc
+
+	// Function arguments
+	.set	KEY,		%rdi
+	.set	LE_CTR_PTR,	%rsi
+	.set	GHASH_ACC_PTR,	%rdx
+	.set	SRC,		%rcx
+	.set	DST,		%r8
+	.set	DATALEN,	%r9d
+	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!
+
+	// Additional local variables
+
+	// %rax and %k1 are used as temporary registers.  LE_CTR_PTR is also
+	// available as a temporary register after the counter is loaded.
+
+	// AES key length in bytes
+	.set	AESKEYLEN,	%r10d
+	.set	AESKEYLEN64,	%r10
+
+	// Pointer to the last AES round key for the chosen AES variant
+	.set	RNDKEYLAST_PTR,	%r11
+
+	// In the main loop, V0-V3 are used as AES input and output.  Elsewhere
+	// they are used as temporary registers.
+
+	// GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data.
+	.set	GHASHDATA0,	V4
+	.set	GHASHDATA0_XMM,	%xmm4
+	.set	GHASHDATA1,	V5
+	.set	GHASHDATA1_XMM,	%xmm5
+	.set	GHASHDATA2,	V6
+	.set	GHASHDATA2_XMM,	%xmm6
+	.set	GHASHDATA3,	V7
+
+	// BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
+	// using vpshufb, copied to all 128-bit lanes.
+	.set	BSWAP_MASK,	V8
+
+	// RNDKEY temporarily holds the next AES round key.
+	.set	RNDKEY,		V9
+
+	// GHASH_ACC is the accumulator variable for GHASH.  When fully reduced,
+	// only the lowest 128-bit lane can be nonzero.  When not fully reduced,
+	// more than one lane may be used, and they need to be XOR'd together.
+	.set	GHASH_ACC,	V10
+	.set	GHASH_ACC_XMM,	%xmm10
+
+	// LE_CTR_INC is the vector of 32-bit words that need to be added to a
+	// vector of little-endian counter blocks to advance it forwards.
+	.set	LE_CTR_INC,	V11
+
+	// LE_CTR contains the next set of little-endian counter blocks.
+	.set	LE_CTR,		V12
+
+	// RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys,
+	// copied to all 128-bit lanes.  RNDKEY0 is the zero-th round key,
+	// RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last.
+	.set	RNDKEY0,	V13
+	.set	RNDKEYLAST,	V14
+	.set	RNDKEY_M9,	V15
+	.set	RNDKEY_M8,	V16
+	.set	RNDKEY_M7,	V17
+	.set	RNDKEY_M6,	V18
+	.set	RNDKEY_M5,	V19
+
+	// RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with
+	// the corresponding block of source data.  This is useful because
+	// vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can
+	// be computed in parallel with the AES rounds.
+	.set	RNDKEYLAST0,	V20
+	.set	RNDKEYLAST1,	V21
+	.set	RNDKEYLAST2,	V22
+	.set	RNDKEYLAST3,	V23
+
+	// GHASHTMP[0-2] are temporary variables used by _ghash_step_4x.  These
+	// cannot coincide with anything used for AES encryption, since for
+	// performance reasons GHASH and AES encryption are interleaved.
+	.set	GHASHTMP0,	V24
+	.set	GHASHTMP1,	V25
+	.set	GHASHTMP2,	V26
+
+	// H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1.  The
+	// descending numbering reflects the order of the key powers.
+	.set	H_POW4,		V27
+	.set	H_POW3,		V28
+	.set	H_POW2,		V29
+	.set	H_POW1,		V30
+
+	// GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
+	.set	GFPOLY,		V31
+
+	// Load some constants.
+	vbroadcasti32x4	.Lbswap_mask(%rip), BSWAP_MASK
+	vbroadcasti32x4	.Lgfpoly(%rip), GFPOLY
+
+	// Load the GHASH accumulator and the starting counter.
+	vmovdqu		(GHASH_ACC_PTR), GHASH_ACC_XMM
+	vbroadcasti32x4	(LE_CTR_PTR), LE_CTR
+
+	// Load the AES key length in bytes.
+	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+	// Make RNDKEYLAST_PTR point to the last AES round key.  This is the
+	// round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
+	// respectively.  Then load the zero-th and last round keys.
+	lea		6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
+	vbroadcasti32x4	(KEY), RNDKEY0
+	vbroadcasti32x4	(RNDKEYLAST_PTR), RNDKEYLAST
+
+	// Finish initializing LE_CTR by adding [0, 1, ...] to its low words.
+	vpaddd		.Lctr_pattern(%rip), LE_CTR, LE_CTR
+
+	// Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
+.if VL == 32
+	vbroadcasti32x4	.Linc_2blocks(%rip), LE_CTR_INC
+.elseif VL == 64
+	vbroadcasti32x4	.Linc_4blocks(%rip), LE_CTR_INC
+.else
+	.error "Unsupported vector length"
+.endif
+
+	// If there are at least 4*VL bytes of data, then continue into the loop
+	// that processes 4*VL bytes of data at a time.  Otherwise skip it.
+	//
+	// Pre-subtracting 4*VL from DATALEN saves an instruction from the main
+	// loop and also ensures that at least one write always occurs to
+	// DATALEN, zero-extending it and allowing DATALEN64 to be used later.
+	sub		$4*VL, DATALEN
+	jl		.Lcrypt_loop_4x_done\@
+
+	// Load powers of the hash key.
+	vmovdqu8	OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4
+	vmovdqu8	OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3
+	vmovdqu8	OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2
+	vmovdqu8	OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1
+
+	// Main loop: en/decrypt and hash 4 vectors at a time.
+	//
+	// When possible, interleave the AES encryption of the counter blocks
+	// with the GHASH update of the ciphertext blocks.  This improves
+	// performance on many CPUs because the execution ports used by the VAES
+	// instructions often differ from those used by vpclmulqdq and other
+	// instructions used in GHASH.  For example, many Intel CPUs dispatch
+	// vaesenc to ports 0 and 1 and vpclmulqdq to port 5.
+	//
+	// The interleaving is easiest to do during decryption, since during
+	// decryption the ciphertext blocks are immediately available.  For
+	// encryption, instead encrypt the first set of blocks, then hash those
+	// blocks while encrypting the next set of blocks, repeat that as
+	// needed, and finally hash the last set of blocks.
+
+.if \enc
+	// Encrypt the first 4 vectors of plaintext blocks.  Leave the resulting
+	// ciphertext in GHASHDATA[0-3] for GHASH.
+	_ctr_begin_4x
+	lea		16(KEY), %rax
+1:
+	vbroadcasti32x4	(%rax), RNDKEY
+	_vaesenc_4x	RNDKEY
+	add		$16, %rax
+	cmp		%rax, RNDKEYLAST_PTR
+	jne		1b
+	vpxord		0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
+	vpxord		1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
+	vpxord		2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
+	vpxord		3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
+	vaesenclast	RNDKEYLAST0, V0, GHASHDATA0
+	vaesenclast	RNDKEYLAST1, V1, GHASHDATA1
+	vaesenclast	RNDKEYLAST2, V2, GHASHDATA2
+	vaesenclast	RNDKEYLAST3, V3, GHASHDATA3
+	vmovdqu8	GHASHDATA0, 0*VL(DST)
+	vmovdqu8	GHASHDATA1, 1*VL(DST)
+	vmovdqu8	GHASHDATA2, 2*VL(DST)
+	vmovdqu8	GHASHDATA3, 3*VL(DST)
+	add		$4*VL, SRC
+	add		$4*VL, DST
+	sub		$4*VL, DATALEN
+	jl		.Lghash_last_ciphertext_4x\@
+.endif
+
+	// Cache as many additional AES round keys as possible.
+.irp i, 9,8,7,6,5
+	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY_M\i
+.endr
+
+.Lcrypt_loop_4x\@:
+
+	// If decrypting, load more ciphertext blocks into GHASHDATA[0-3].  If
+	// encrypting, GHASHDATA[0-3] already contain the previous ciphertext.
+.if !\enc
+	vmovdqu8	0*VL(SRC), GHASHDATA0
+	vmovdqu8	1*VL(SRC), GHASHDATA1
+	vmovdqu8	2*VL(SRC), GHASHDATA2
+	vmovdqu8	3*VL(SRC), GHASHDATA3
+.endif
+
+	// Start the AES encryption of the counter blocks.
+	_ctr_begin_4x
+	cmp		$24, AESKEYLEN
+	jl		128f	// AES-128?
+	je		192f	// AES-192?
+	// AES-256
+	vbroadcasti32x4	-13*16(RNDKEYLAST_PTR), RNDKEY
+	_vaesenc_4x	RNDKEY
+	vbroadcasti32x4	-12*16(RNDKEYLAST_PTR), RNDKEY
+	_vaesenc_4x	RNDKEY
+192:
+	vbroadcasti32x4	-11*16(RNDKEYLAST_PTR), RNDKEY
+	_vaesenc_4x	RNDKEY
+	vbroadcasti32x4	-10*16(RNDKEYLAST_PTR), RNDKEY
+	_vaesenc_4x	RNDKEY
+128:
+
+	// XOR the source data with the last round key, saving the result in
+	// RNDKEYLAST[0-3].  This reduces latency by taking advantage of the
+	// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
+.if \enc
+	vpxord		0*VL(SRC), RNDKEYLAST, RNDKEYLAST0
+	vpxord		1*VL(SRC), RNDKEYLAST, RNDKEYLAST1
+	vpxord		2*VL(SRC), RNDKEYLAST, RNDKEYLAST2
+	vpxord		3*VL(SRC), RNDKEYLAST, RNDKEYLAST3
+.else
+	vpxord		GHASHDATA0, RNDKEYLAST, RNDKEYLAST0
+	vpxord		GHASHDATA1, RNDKEYLAST, RNDKEYLAST1
+	vpxord		GHASHDATA2, RNDKEYLAST, RNDKEYLAST2
+	vpxord		GHASHDATA3, RNDKEYLAST, RNDKEYLAST3
+.endif
+
+	// Finish the AES encryption of the counter blocks in V0-V3, interleaved
+	// with the GHASH update of the ciphertext blocks in GHASHDATA[0-3].
+.irp i, 9,8,7,6,5
+	_vaesenc_4x	RNDKEY_M\i
+	_ghash_step_4x	(9 - \i)
+.endr
+.irp i, 4,3,2,1
+	vbroadcasti32x4	-\i*16(RNDKEYLAST_PTR), RNDKEY
+	_vaesenc_4x	RNDKEY
+	_ghash_step_4x	(9 - \i)
+.endr
+	_ghash_step_4x	9
+
+	// Do the last AES round.  This handles the XOR with the source data
+	// too, as per the optimization described above.
+	vaesenclast	RNDKEYLAST0, V0, GHASHDATA0
+	vaesenclast	RNDKEYLAST1, V1, GHASHDATA1
+	vaesenclast	RNDKEYLAST2, V2, GHASHDATA2
+	vaesenclast	RNDKEYLAST3, V3, GHASHDATA3
+
+	// Store the en/decrypted data to DST.
+	vmovdqu8	GHASHDATA0, 0*VL(DST)
+	vmovdqu8	GHASHDATA1, 1*VL(DST)
+	vmovdqu8	GHASHDATA2, 2*VL(DST)
+	vmovdqu8	GHASHDATA3, 3*VL(DST)
+
+	add		$4*VL, SRC
+	add		$4*VL, DST
+	sub		$4*VL, DATALEN
+	jge		.Lcrypt_loop_4x\@
+
+.if \enc
+.Lghash_last_ciphertext_4x\@:
+	// Update GHASH with the last set of ciphertext blocks.
+.irp i, 0,1,2,3,4,5,6,7,8,9
+	_ghash_step_4x	\i
+.endr
+.endif
+
+.Lcrypt_loop_4x_done\@:
+
+	// Undo the extra subtraction by 4*VL and check whether data remains.
+	add		$4*VL, DATALEN
+	jz		.Ldone\@
+
+	// The data length isn't a multiple of 4*VL.  Process the remaining data
+	// of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time.
+	// Going one vector at a time may seem inefficient compared to having
+	// separate code paths for each possible number of vectors remaining.
+	// However, using a loop keeps the code size down, and it performs
+	// surprising well; modern CPUs will start executing the next iteration
+	// before the previous one finishes and also predict the number of loop
+	// iterations.  For a similar reason, we roll up the AES rounds.
+	//
+	// On the last iteration, the remaining length may be less than VL.
+	// Handle this using masking.
+	//
+	// Since there are enough key powers available for all remaining data,
+	// there is no need to do a GHASH reduction after each iteration.
+	// Instead, multiply each remaining block by its own key power, and only
+	// do a GHASH reduction at the very end.
+
+	// Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N
+	// is the number of blocks that remain.
+	.set		POWERS_PTR, LE_CTR_PTR	// LE_CTR_PTR is free to be reused.
+	mov		DATALEN, %eax
+	neg		%rax
+	and		$~15, %rax  // -round_up(DATALEN, 16)
+	lea		OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR
+
+	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.
+	.set		LO, GHASHDATA0
+	.set		LO_XMM, GHASHDATA0_XMM
+	.set		MI, GHASHDATA1
+	.set		MI_XMM, GHASHDATA1_XMM
+	.set		HI, GHASHDATA2
+	.set		HI_XMM, GHASHDATA2_XMM
+	vpxor		LO_XMM, LO_XMM, LO_XMM
+	vpxor		MI_XMM, MI_XMM, MI_XMM
+	vpxor		HI_XMM, HI_XMM, HI_XMM
+
+.Lcrypt_loop_1x\@:
+
+	// Select the appropriate mask for this iteration: all 1's if
+	// DATALEN >= VL, otherwise DATALEN 1's.  Do this branchlessly using the
+	// bzhi instruction from BMI2.  (This relies on DATALEN <= 255.)
+.if VL < 64
+	mov		$-1, %eax
+	bzhi		DATALEN, %eax, %eax
+	kmovd		%eax, %k1
+.else
+	mov		$-1, %rax
+	bzhi		DATALEN64, %rax, %rax
+	kmovq		%rax, %k1
+.endif
+
+	// Encrypt a vector of counter blocks.  This does not need to be masked.
+	vpshufb		BSWAP_MASK, LE_CTR, V0
+	vpaddd		LE_CTR_INC, LE_CTR, LE_CTR
+	vpxord		RNDKEY0, V0, V0
+	lea		16(KEY), %rax
+1:
+	vbroadcasti32x4	(%rax), RNDKEY
+	vaesenc		RNDKEY, V0, V0
+	add		$16, %rax
+	cmp		%rax, RNDKEYLAST_PTR
+	jne		1b
+	vaesenclast	RNDKEYLAST, V0, V0
+
+	// XOR the data with the appropriate number of keystream bytes.
+	vmovdqu8	(SRC), V1{%k1}{z}
+	vpxord		V1, V0, V0
+	vmovdqu8	V0, (DST){%k1}
+
+	// Update GHASH with the ciphertext block(s), without reducing.
+	//
+	// In the case of DATALEN < VL, the ciphertext is zero-padded to VL.
+	// (If decrypting, it's done by the above masked load.  If encrypting,
+	// it's done by the below masked register-to-register move.)  Note that
+	// if DATALEN <= VL - 16, there will be additional padding beyond the
+	// padding of the last block specified by GHASH itself; i.e., there may
+	// be whole block(s) that get processed by the GHASH multiplication and
+	// reduction instructions but should not actually be included in the
+	// GHASH.  However, any such blocks are all-zeroes, and the values that
+	// they're multiplied with are also all-zeroes.  Therefore they just add
+	// 0 * 0 = 0 to the final GHASH result, which makes no difference.
+	vmovdqu8        (POWERS_PTR), H_POW1
+.if \enc
+	vmovdqu8	V0, V1{%k1}{z}
+.endif
+	vpshufb		BSWAP_MASK, V1, V0
+	vpxord		GHASH_ACC, V0, V0
+	_ghash_mul_noreduce	H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3
+	vpxor		GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM
+
+	add		$VL, POWERS_PTR
+	add		$VL, SRC
+	add		$VL, DST
+	sub		$VL, DATALEN
+	jg		.Lcrypt_loop_1x\@
+
+	// Finally, do the GHASH reduction.
+	_ghash_reduce	LO, MI, HI, GFPOLY, V0
+	_horizontal_xor	HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2
+
+.Ldone\@:
+	// Store the updated GHASH accumulator back to memory.
+	vmovdqu		GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+	vzeroupper	// This is needed after using ymm or zmm registers.
+	RET
+.endm
+
+// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+//				     const u32 le_ctr[4], u8 ghash_acc[16],
+//				     u64 total_aadlen, u64 total_datalen);
+// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+//				     const u32 le_ctr[4],
+//				     const u8 ghash_acc[16],
+//				     u64 total_aadlen, u64 total_datalen,
+//				     const u8 tag[16], int taglen);
+//
+// This macro generates one of the above two functions (with \enc selecting
+// which one).  Both functions finish computing the GCM authentication tag by
+// updating GHASH with the lengths block and encrypting the GHASH accumulator.
+// |total_aadlen| and |total_datalen| must be the total length of the additional
+// authenticated data and the en/decrypted data in bytes, respectively.
+//
+// The encryption function then stores the full-length (16-byte) computed
+// authentication tag to |ghash_acc|.  The decryption function instead loads the
+// expected authentication tag (the one that was transmitted) from the 16-byte
+// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
+// computed tag in constant time, and returns true if and only if they match.
+.macro	_aes_gcm_final	enc
+
+	// Function arguments
+	.set	KEY,		%rdi
+	.set	LE_CTR_PTR,	%rsi
+	.set	GHASH_ACC_PTR,	%rdx
+	.set	TOTAL_AADLEN,	%rcx
+	.set	TOTAL_DATALEN,	%r8
+	.set	TAG,		%r9
+	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)
+
+	// Additional local variables.
+	// %rax, %xmm0-%xmm3, and %k1 are used as temporary registers.
+	.set	AESKEYLEN,	%r11d
+	.set	AESKEYLEN64,	%r11
+	.set	GFPOLY,		%xmm4
+	.set	BSWAP_MASK,	%xmm5
+	.set	LE_CTR,		%xmm6
+	.set	GHASH_ACC,	%xmm7
+	.set	H_POW1,		%xmm8
+
+	// Load some constants.
+	vmovdqa		.Lgfpoly(%rip), GFPOLY
+	vmovdqa		.Lbswap_mask(%rip), BSWAP_MASK
+
+	// Load the AES key length in bytes.
+	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
+
+	// Set up a counter block with 1 in the low 32-bit word.  This is the
+	// counter that produces the ciphertext needed to encrypt the auth tag.
+	// GFPOLY has 1 in the low word, so grab the 1 from there using a blend.
+	vpblendd	$0xe, (LE_CTR_PTR), GFPOLY, LE_CTR
+
+	// Build the lengths block and XOR it with the GHASH accumulator.
+	// Although the lengths block is defined as the AAD length followed by
+	// the en/decrypted data length, both in big-endian byte order, a byte
+	// reflection of the full block is needed because of the way we compute
+	// GHASH (see _ghash_mul_step).  By using little-endian values in the
+	// opposite order, we avoid having to reflect any bytes here.
+	vmovq		TOTAL_DATALEN, %xmm0
+	vpinsrq		$1, TOTAL_AADLEN, %xmm0, %xmm0
+	vpsllq		$3, %xmm0, %xmm0	// Bytes to bits
+	vpxor		(GHASH_ACC_PTR), %xmm0, GHASH_ACC
+
+	// Load the first hash key power (H^1), which is stored last.
+	vmovdqu8	OFFSETOFEND_H_POWERS-16(KEY), H_POW1
+
+.if !\enc
+	// Prepare a mask of TAGLEN one bits.
+	movl		8(%rsp), TAGLEN
+	mov		$-1, %eax
+	bzhi		TAGLEN, %eax, %eax
+	kmovd		%eax, %k1
+.endif
+
+	// Make %rax point to the last AES round key for the chosen AES variant.
+	lea		6*16(KEY,AESKEYLEN64,4), %rax
+
+	// Start the AES encryption of the counter block by swapping the counter
+	// block to big-endian and XOR-ing it with the zero-th AES round key.
+	vpshufb		BSWAP_MASK, LE_CTR, %xmm0
+	vpxor		(KEY), %xmm0, %xmm0
+
+	// Complete the AES encryption and multiply GHASH_ACC by H^1.
+	// Interleave the AES and GHASH instructions to improve performance.
+	cmp		$24, AESKEYLEN
+	jl		128f	// AES-128?
+	je		192f	// AES-192?
+	// AES-256
+	vaesenc		-13*16(%rax), %xmm0, %xmm0
+	vaesenc		-12*16(%rax), %xmm0, %xmm0
+192:
+	vaesenc		-11*16(%rax), %xmm0, %xmm0
+	vaesenc		-10*16(%rax), %xmm0, %xmm0
+128:
+.irp i, 0,1,2,3,4,5,6,7,8
+	_ghash_mul_step	\i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+			%xmm1, %xmm2, %xmm3
+	vaesenc		(\i-9)*16(%rax), %xmm0, %xmm0
+.endr
+	_ghash_mul_step	9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+			%xmm1, %xmm2, %xmm3
+
+	// Undo the byte reflection of the GHASH accumulator.
+	vpshufb		BSWAP_MASK, GHASH_ACC, GHASH_ACC
+
+	// Do the last AES round and XOR the resulting keystream block with the
+	// GHASH accumulator to produce the full computed authentication tag.
+	//
+	// Reduce latency by taking advantage of the property vaesenclast(key,
+	// a) ^ b == vaesenclast(key ^ b, a).  I.e., XOR GHASH_ACC into the last
+	// round key, instead of XOR'ing the final AES output with GHASH_ACC.
+	//
+	// enc_final then returns the computed auth tag, while dec_final
+	// compares it with the transmitted one and returns a bool.  To compare
+	// the tags, dec_final XORs them together and uses vptest to check
+	// whether the result is all-zeroes.  This should be constant-time.
+	// dec_final applies the vaesenclast optimization to this additional
+	// value XOR'd too, using vpternlogd to XOR the last round key, GHASH
+	// accumulator, and transmitted auth tag together in one instruction.
+.if \enc
+	vpxor		(%rax), GHASH_ACC, %xmm1
+	vaesenclast	%xmm1, %xmm0, GHASH_ACC
+	vmovdqu		GHASH_ACC, (GHASH_ACC_PTR)
+.else
+	vmovdqu		(TAG), %xmm1
+	vpternlogd	$0x96, (%rax), GHASH_ACC, %xmm1
+	vaesenclast	%xmm1, %xmm0, %xmm0
+	xor		%eax, %eax
+	vmovdqu8	%xmm0, %xmm0{%k1}{z}	// Truncate to TAGLEN bytes
+	vptest		%xmm0, %xmm0
+	sete		%al
+.endif
+	// No need for vzeroupper here, since only used xmm registers were used.
+	RET
+.endm
+
+_set_veclen 32
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256)
+	_aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256)
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256)
+	_aes_gcm_update	1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256)
+	_aes_gcm_update	0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256)
+
+_set_veclen 64
+SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512)
+	_aes_gcm_precompute
+SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512)
+SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512)
+	_aes_gcm_update	1
+SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512)
+SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512)
+	_aes_gcm_update	0
+SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512)
+
+// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+//				      u8 ghash_acc[16],
+//				      const u8 *aad, int aadlen);
+//
+// This function processes the AAD (Additional Authenticated Data) in GCM.
+// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
+// data given by |aad| and |aadlen|.  |key->ghash_key_powers| must have been
+// initialized.  On the first call, |ghash_acc| must be all zeroes.  |aadlen|
+// must be a multiple of 16, except on the last call where it can be any length.
+// The caller must do any buffering needed to ensure this.
+//
+// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes.
+// Therefore, for AAD processing we currently only provide this implementation
+// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop.  This
+// keeps the code size down, and it enables some micro-optimizations, e.g. using
+// VEX-coded instructions instead of EVEX-coded to save some instruction bytes.
+// To optimize for large amounts of AAD, we could implement a 4x-wide loop and
+// provide a version using 512-bit vectors, but that doesn't seem to be useful.
+SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10)
+
+	// Function arguments
+	.set	KEY,		%rdi
+	.set	GHASH_ACC_PTR,	%rsi
+	.set	AAD,		%rdx
+	.set	AADLEN,		%ecx
+	.set	AADLEN64,	%rcx	// Zero-extend AADLEN before using!
+
+	// Additional local variables.
+	// %rax, %ymm0-%ymm3, and %k1 are used as temporary registers.
+	.set	BSWAP_MASK,	%ymm4
+	.set	GFPOLY,		%ymm5
+	.set	GHASH_ACC,	%ymm6
+	.set	GHASH_ACC_XMM,	%xmm6
+	.set	H_POW1,		%ymm7
+
+	// Load some constants.
+	vbroadcasti128	.Lbswap_mask(%rip), BSWAP_MASK
+	vbroadcasti128	.Lgfpoly(%rip), GFPOLY
+
+	// Load the GHASH accumulator.
+	vmovdqu		(GHASH_ACC_PTR), GHASH_ACC_XMM
+
+	// Update GHASH with 32 bytes of AAD at a time.
+	//
+	// Pre-subtracting 32 from AADLEN saves an instruction from the loop and
+	// also ensures that at least one write always occurs to AADLEN,
+	// zero-extending it and allowing AADLEN64 to be used later.
+	sub		$32, AADLEN
+	jl		.Laad_loop_1x_done
+	vmovdqu8	OFFSETOFEND_H_POWERS-32(KEY), H_POW1	// [H^2, H^1]
+.Laad_loop_1x:
+	vmovdqu		(AAD), %ymm0
+	vpshufb		BSWAP_MASK, %ymm0, %ymm0
+	vpxor		%ymm0, GHASH_ACC, GHASH_ACC
+	_ghash_mul	H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+			%ymm0, %ymm1, %ymm2
+	vextracti128	$1, GHASH_ACC, %xmm0
+	vpxor		%xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
+	add		$32, AAD
+	sub		$32, AADLEN
+	jge		.Laad_loop_1x
+.Laad_loop_1x_done:
+	add		$32, AADLEN
+	jz		.Laad_done
+
+	// Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD.
+	mov		$-1, %eax
+	bzhi		AADLEN, %eax, %eax
+	kmovd		%eax, %k1
+	vmovdqu8	(AAD), %ymm0{%k1}{z}
+	neg		AADLEN64
+	and		$~15, AADLEN64  // -round_up(AADLEN, 16)
+	vmovdqu8	OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1
+	vpshufb		BSWAP_MASK, %ymm0, %ymm0
+	vpxor		%ymm0, GHASH_ACC, GHASH_ACC
+	_ghash_mul	H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \
+			%ymm0, %ymm1, %ymm2
+	vextracti128	$1, GHASH_ACC, %xmm0
+	vpxor		%xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM
+
+.Laad_done:
+	// Store the updated GHASH accumulator back to memory.
+	vmovdqu		GHASH_ACC_XMM, (GHASH_ACC_PTR)
+
+	vzeroupper	// This is needed after using ymm or zmm registers.
+	RET
+SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10)
+
+SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10)
+	_aes_gcm_final	1
+SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10)
+SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10)
+	_aes_gcm_final	0
+SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10)
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 39066b57a70e..eb153eff9331 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -10,16 +10,7 @@
  *            Vinodh Gopal <vinodh.gopal@intel.com>
  *            Kahraman Akdemir
  *
- * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
- * interface for 64-bit kernels.
- *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
- *             Aidan O'Mahony (aidan.o.mahony@intel.com)
- *             Adrian Hoban <adrian.hoban@intel.com>
- *             James Guilford (james.guilford@intel.com)
- *             Gabriele Paoloni <gabriele.paoloni@intel.com>
- *             Tadeusz Struk (tadeusz.struk@intel.com)
- *             Wajdi Feghali (wajdi.k.feghali@intel.com)
- *    Copyright (c) 2010, Intel Corporation.
+ * Copyright (c) 2010, Intel Corporation.
  *
  * Ported x86_64 version to x86:
  *    Author: Mathias Krause <minipli@googlemail.com>
@@ -27,95 +18,6 @@
 
 #include <linux/linkage.h>
 #include <asm/frame.h>
-#include <asm/nospec-branch.h>
-
-/*
- * The following macros are used to move an (un)aligned 16 byte value to/from
- * an XMM register.  This can done for either FP or integer values, for FP use
- * movaps (move aligned packed single) or integer use movdqa (move double quad
- * aligned).  It doesn't make a performance difference which instruction is used
- * since Nehalem (original Core i7) was released.  However, the movaps is a byte
- * shorter, so that is the one we'll use for now. (same for unaligned).
- */
-#define MOVADQ	movaps
-#define MOVUDQ	movups
-
-#ifdef __x86_64__
-
-# constants in mergeable sections, linker can reorder and merge
-.section	.rodata.cst16.POLY, "aM", @progbits, 16
-.align 16
-POLY:   .octa 0xC2000000000000000000000000000001
-.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
-.align 16
-TWOONE: .octa 0x00000001000000000000000000000001
-
-.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
-.align 16
-SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
-.section	.rodata.cst16.MASK1, "aM", @progbits, 16
-.align 16
-MASK1:      .octa 0x0000000000000000ffffffffffffffff
-.section	.rodata.cst16.MASK2, "aM", @progbits, 16
-.align 16
-MASK2:      .octa 0xffffffffffffffff0000000000000000
-.section	.rodata.cst16.ONE, "aM", @progbits, 16
-.align 16
-ONE:        .octa 0x00000000000000000000000000000001
-.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
-.align 16
-F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
-.section	.rodata.cst16.dec, "aM", @progbits, 16
-.align 16
-dec:        .octa 0x1
-.section	.rodata.cst16.enc, "aM", @progbits, 16
-.align 16
-enc:        .octa 0x2
-
-# order of these constants should not change.
-# more specifically, ALL_F should follow SHIFT_MASK,
-# and zero should follow ALL_F
-.section	.rodata, "a", @progbits
-.align 16
-SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
-ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
-            .octa 0x00000000000000000000000000000000
-
-.text
-
-#define AadHash 16*0
-#define AadLen 16*1
-#define InLen (16*1)+8
-#define PBlockEncKey 16*2
-#define OrigIV 16*3
-#define CurCount 16*4
-#define PBlockLen 16*5
-#define	HashKey		16*6	// store HashKey <<1 mod poly here
-#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
-#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
-#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
-#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
-				// bits of  HashKey <<1 mod poly here
-				//(for Karatsuba purposes)
-#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
-				// bits of  HashKey^2 <<1 mod poly here
-				// (for Karatsuba purposes)
-#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
-				// bits of  HashKey^3 <<1 mod poly here
-				// (for Karatsuba purposes)
-#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
-				// bits of  HashKey^4 <<1 mod poly here
-				// (for Karatsuba purposes)
-
-#define arg1 rdi
-#define arg2 rsi
-#define arg3 rdx
-#define arg4 rcx
-#define arg5 r8
-#define arg6 r9
-#define keysize 2*15*16(%arg1)
-#endif
-
 
 #define STATE1	%xmm0
 #define STATE2	%xmm4
@@ -162,1409 +64,6 @@ ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
 #define TKEYP	T1
 #endif
 
-.macro FUNC_SAVE
-	push	%r12
-	push	%r13
-	push	%r14
-#
-# states of %xmm registers %xmm6:%xmm15 not saved
-# all %xmm registers are clobbered
-#
-.endm
-
-
-.macro FUNC_RESTORE
-	pop	%r14
-	pop	%r13
-	pop	%r12
-.endm
-
-# Precompute hashkeys.
-# Input: Hash subkey.
-# Output: HashKeys stored in gcm_context_data.  Only needs to be called
-# once per key.
-# clobbers r12, and tmp xmm registers.
-.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
-	mov	\SUBKEY, %r12
-	movdqu	(%r12), \TMP3
-	movdqa	SHUF_MASK(%rip), \TMP2
-	pshufb	\TMP2, \TMP3
-
-	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
-
-	movdqa	\TMP3, \TMP2
-	psllq	$1, \TMP3
-	psrlq	$63, \TMP2
-	movdqa	\TMP2, \TMP1
-	pslldq	$8, \TMP2
-	psrldq	$8, \TMP1
-	por	\TMP2, \TMP3
-
-	# reduce HashKey<<1
-
-	pshufd	$0x24, \TMP1, \TMP2
-	pcmpeqd TWOONE(%rip), \TMP2
-	pand	POLY(%rip), \TMP2
-	pxor	\TMP2, \TMP3
-	movdqu	\TMP3, HashKey(%arg2)
-
-	movdqa	   \TMP3, \TMP5
-	pshufd	   $78, \TMP3, \TMP1
-	pxor	   \TMP3, \TMP1
-	movdqu	   \TMP1, HashKey_k(%arg2)
-
-	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^2<<1 (mod poly)
-	movdqu	   \TMP5, HashKey_2(%arg2)
-# HashKey_2 = HashKey^2<<1 (mod poly)
-	pshufd	   $78, \TMP5, \TMP1
-	pxor	   \TMP5, \TMP1
-	movdqu	   \TMP1, HashKey_2_k(%arg2)
-
-	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^3<<1 (mod poly)
-	movdqu	   \TMP5, HashKey_3(%arg2)
-	pshufd	   $78, \TMP5, \TMP1
-	pxor	   \TMP5, \TMP1
-	movdqu	   \TMP1, HashKey_3_k(%arg2)
-
-	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^3<<1 (mod poly)
-	movdqu	   \TMP5, HashKey_4(%arg2)
-	pshufd	   $78, \TMP5, \TMP1
-	pxor	   \TMP5, \TMP1
-	movdqu	   \TMP1, HashKey_4_k(%arg2)
-.endm
-
-# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
-# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
-.macro GCM_INIT Iv SUBKEY AAD AADLEN
-	mov \AADLEN, %r11
-	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
-	xor %r11d, %r11d
-	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
-	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
-	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
-	mov \Iv, %rax
-	movdqu (%rax), %xmm0
-	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
-
-	movdqa  SHUF_MASK(%rip), %xmm2
-	pshufb %xmm2, %xmm0
-	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
-
-	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
-	movdqu HashKey(%arg2), %xmm13
-
-	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
-	%xmm4, %xmm5, %xmm6
-.endm
-
-# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
-# struct has been initialized by GCM_INIT.
-# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
-# Clobbers rax, r10-r13, and xmm0-xmm15
-.macro GCM_ENC_DEC operation
-	movdqu AadHash(%arg2), %xmm8
-	movdqu HashKey(%arg2), %xmm13
-	add %arg5, InLen(%arg2)
-
-	xor %r11d, %r11d # initialise the data pointer offset as zero
-	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
-
-	sub %r11, %arg5		# sub partial block data used
-	mov %arg5, %r13		# save the number of bytes
-
-	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
-	mov %r13, %r12
-	# Encrypt/Decrypt first few blocks
-
-	and	$(3<<4), %r12
-	jz	.L_initial_num_blocks_is_0_\@
-	cmp	$(2<<4), %r12
-	jb	.L_initial_num_blocks_is_1_\@
-	je	.L_initial_num_blocks_is_2_\@
-.L_initial_num_blocks_is_3_\@:
-	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
-	sub	$48, %r13
-	jmp	.L_initial_blocks_\@
-.L_initial_num_blocks_is_2_\@:
-	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
-	sub	$32, %r13
-	jmp	.L_initial_blocks_\@
-.L_initial_num_blocks_is_1_\@:
-	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
-	sub	$16, %r13
-	jmp	.L_initial_blocks_\@
-.L_initial_num_blocks_is_0_\@:
-	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
-.L_initial_blocks_\@:
-
-	# Main loop - Encrypt/Decrypt remaining blocks
-
-	test	%r13, %r13
-	je	.L_zero_cipher_left_\@
-	sub	$64, %r13
-	je	.L_four_cipher_left_\@
-.L_crypt_by_4_\@:
-	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
-	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
-	%xmm7, %xmm8, enc
-	add	$64, %r11
-	sub	$64, %r13
-	jne	.L_crypt_by_4_\@
-.L_four_cipher_left_\@:
-	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
-%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
-.L_zero_cipher_left_\@:
-	movdqu %xmm8, AadHash(%arg2)
-	movdqu %xmm0, CurCount(%arg2)
-
-	mov	%arg5, %r13
-	and	$15, %r13			# %r13 = arg5 (mod 16)
-	je	.L_multiple_of_16_bytes_\@
-
-	mov %r13, PBlockLen(%arg2)
-
-	# Handle the last <16 Byte block separately
-	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
-	movdqu %xmm0, CurCount(%arg2)
-	movdqa SHUF_MASK(%rip), %xmm10
-	pshufb %xmm10, %xmm0
-
-	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
-	movdqu %xmm0, PBlockEncKey(%arg2)
-
-	cmp	$16, %arg5
-	jge	.L_large_enough_update_\@
-
-	lea (%arg4,%r11,1), %r10
-	mov %r13, %r12
-	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
-	jmp	.L_data_read_\@
-
-.L_large_enough_update_\@:
-	sub	$16, %r11
-	add	%r13, %r11
-
-	# receive the last <16 Byte block
-	movdqu	(%arg4, %r11, 1), %xmm1
-
-	sub	%r13, %r11
-	add	$16, %r11
-
-	lea	SHIFT_MASK+16(%rip), %r12
-	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
-	# (r13 is the number of bytes in plaintext mod 16)
-	sub	%r13, %r12
-	# get the appropriate shuffle mask
-	movdqu	(%r12), %xmm2
-	# shift right 16-r13 bytes
-	pshufb  %xmm2, %xmm1
-
-.L_data_read_\@:
-	lea ALL_F+16(%rip), %r12
-	sub %r13, %r12
-
-.ifc \operation, dec
-	movdqa  %xmm1, %xmm2
-.endif
-	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
-	movdqu	(%r12), %xmm1
-	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
-	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
-.ifc \operation, dec
-	pand    %xmm1, %xmm2
-	movdqa SHUF_MASK(%rip), %xmm10
-	pshufb %xmm10 ,%xmm2
-
-	pxor %xmm2, %xmm8
-.else
-	movdqa SHUF_MASK(%rip), %xmm10
-	pshufb %xmm10,%xmm0
-
-	pxor	%xmm0, %xmm8
-.endif
-
-	movdqu %xmm8, AadHash(%arg2)
-.ifc \operation, enc
-	# GHASH computation for the last <16 byte block
-	movdqa SHUF_MASK(%rip), %xmm10
-	# shuffle xmm0 back to output as ciphertext
-	pshufb %xmm10, %xmm0
-.endif
-
-	# Output %r13 bytes
-	movq %xmm0, %rax
-	cmp $8, %r13
-	jle .L_less_than_8_bytes_left_\@
-	mov %rax, (%arg3 , %r11, 1)
-	add $8, %r11
-	psrldq $8, %xmm0
-	movq %xmm0, %rax
-	sub $8, %r13
-.L_less_than_8_bytes_left_\@:
-	mov %al,  (%arg3, %r11, 1)
-	add $1, %r11
-	shr $8, %rax
-	sub $1, %r13
-	jne .L_less_than_8_bytes_left_\@
-.L_multiple_of_16_bytes_\@:
-.endm
-
-# GCM_COMPLETE Finishes update of tag of last partial block
-# Output: Authorization Tag (AUTH_TAG)
-# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
-.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
-	movdqu AadHash(%arg2), %xmm8
-	movdqu HashKey(%arg2), %xmm13
-
-	mov PBlockLen(%arg2), %r12
-
-	test %r12, %r12
-	je .L_partial_done\@
-
-	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-
-.L_partial_done\@:
-	mov AadLen(%arg2), %r12  # %r13 = aadLen (number of bytes)
-	shl	$3, %r12		  # convert into number of bits
-	movd	%r12d, %xmm15		  # len(A) in %xmm15
-	mov InLen(%arg2), %r12
-	shl     $3, %r12                  # len(C) in bits (*128)
-	movq    %r12, %xmm1
-
-	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
-	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
-	pxor	%xmm15, %xmm8
-	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-	# final GHASH computation
-	movdqa SHUF_MASK(%rip), %xmm10
-	pshufb %xmm10, %xmm8
-
-	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
-	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
-	pxor	%xmm8, %xmm0
-.L_return_T_\@:
-	mov	\AUTHTAG, %r10                     # %r10 = authTag
-	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
-	cmp	$16, %r11
-	je	.L_T_16_\@
-	cmp	$8, %r11
-	jl	.L_T_4_\@
-.L_T_8_\@:
-	movq	%xmm0, %rax
-	mov	%rax, (%r10)
-	add	$8, %r10
-	sub	$8, %r11
-	psrldq	$8, %xmm0
-	test	%r11, %r11
-	je	.L_return_T_done_\@
-.L_T_4_\@:
-	movd	%xmm0, %eax
-	mov	%eax, (%r10)
-	add	$4, %r10
-	sub	$4, %r11
-	psrldq	$4, %xmm0
-	test	%r11, %r11
-	je	.L_return_T_done_\@
-.L_T_123_\@:
-	movd	%xmm0, %eax
-	cmp	$2, %r11
-	jl	.L_T_1_\@
-	mov	%ax, (%r10)
-	cmp	$2, %r11
-	je	.L_return_T_done_\@
-	add	$2, %r10
-	sar	$16, %eax
-.L_T_1_\@:
-	mov	%al, (%r10)
-	jmp	.L_return_T_done_\@
-.L_T_16_\@:
-	movdqu	%xmm0, (%r10)
-.L_return_T_done_\@:
-.endm
-
-#ifdef __x86_64__
-/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
-*
-*
-* Input: A and B (128-bits each, bit-reflected)
-* Output: C = A*B*x mod poly, (i.e. >>1 )
-* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
-* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
-*
-*/
-.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
-	movdqa	  \GH, \TMP1
-	pshufd	  $78, \GH, \TMP2
-	pshufd	  $78, \HK, \TMP3
-	pxor	  \GH, \TMP2            # TMP2 = a1+a0
-	pxor	  \HK, \TMP3            # TMP3 = b1+b0
-	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
-	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
-	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
-	pxor	  \GH, \TMP2
-	pxor	  \TMP1, \TMP2          # TMP2 = (a0*b0)+(a1*b0)
-	movdqa	  \TMP2, \TMP3
-	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
-	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
-	pxor	  \TMP3, \GH
-	pxor	  \TMP2, \TMP1          # TMP2:GH holds the result of GH*HK
-
-        # first phase of the reduction
-
-	movdqa    \GH, \TMP2
-	movdqa    \GH, \TMP3
-	movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
-					# in in order to perform
-					# independent shifts
-	pslld     $31, \TMP2            # packed right shift <<31
-	pslld     $30, \TMP3            # packed right shift <<30
-	pslld     $25, \TMP4            # packed right shift <<25
-	pxor      \TMP3, \TMP2          # xor the shifted versions
-	pxor      \TMP4, \TMP2
-	movdqa    \TMP2, \TMP5
-	psrldq    $4, \TMP5             # right shift TMP5 1 DW
-	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
-	pxor      \TMP2, \GH
-
-        # second phase of the reduction
-
-	movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
-					# in in order to perform
-					# independent shifts
-	movdqa    \GH,\TMP3
-	movdqa    \GH,\TMP4
-	psrld     $1,\TMP2              # packed left shift >>1
-	psrld     $2,\TMP3              # packed left shift >>2
-	psrld     $7,\TMP4              # packed left shift >>7
-	pxor      \TMP3,\TMP2		# xor the shifted versions
-	pxor      \TMP4,\TMP2
-	pxor      \TMP5, \TMP2
-	pxor      \TMP2, \GH
-	pxor      \TMP1, \GH            # result is in TMP1
-.endm
-
-# Reads DLEN bytes starting at DPTR and stores in XMMDst
-# where 0 < DLEN < 16
-# Clobbers %rax, DLEN and XMM1
-.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
-        cmp $8, \DLEN
-        jl .L_read_lt8_\@
-        mov (\DPTR), %rax
-        movq %rax, \XMMDst
-        sub $8, \DLEN
-        jz .L_done_read_partial_block_\@
-	xor %eax, %eax
-.L_read_next_byte_\@:
-        shl $8, %rax
-        mov 7(\DPTR, \DLEN, 1), %al
-        dec \DLEN
-        jnz .L_read_next_byte_\@
-        movq %rax, \XMM1
-	pslldq $8, \XMM1
-        por \XMM1, \XMMDst
-	jmp .L_done_read_partial_block_\@
-.L_read_lt8_\@:
-	xor %eax, %eax
-.L_read_next_byte_lt8_\@:
-        shl $8, %rax
-        mov -1(\DPTR, \DLEN, 1), %al
-        dec \DLEN
-        jnz .L_read_next_byte_lt8_\@
-        movq %rax, \XMMDst
-.L_done_read_partial_block_\@:
-.endm
-
-# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
-# clobbers r10-11, xmm14
-.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
-	TMP6 TMP7
-	MOVADQ	   SHUF_MASK(%rip), %xmm14
-	mov	   \AAD, %r10		# %r10 = AAD
-	mov	   \AADLEN, %r11		# %r11 = aadLen
-	pxor	   \TMP7, \TMP7
-	pxor	   \TMP6, \TMP6
-
-	cmp	   $16, %r11
-	jl	   .L_get_AAD_rest\@
-.L_get_AAD_blocks\@:
-	movdqu	   (%r10), \TMP7
-	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
-	pxor	   \TMP7, \TMP6
-	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
-	add	   $16, %r10
-	sub	   $16, %r11
-	cmp	   $16, %r11
-	jge	   .L_get_AAD_blocks\@
-
-	movdqu	   \TMP6, \TMP7
-
-	/* read the last <16B of AAD */
-.L_get_AAD_rest\@:
-	test	   %r11, %r11
-	je	   .L_get_AAD_done\@
-
-	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
-	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
-	pxor	   \TMP6, \TMP7
-	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
-	movdqu \TMP7, \TMP6
-
-.L_get_AAD_done\@:
-	movdqu \TMP6, AadHash(%arg2)
-.endm
-
-# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
-# between update calls.
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
-# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
-# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
-.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
-	AAD_HASH operation
-	mov 	PBlockLen(%arg2), %r13
-	test	%r13, %r13
-	je	.L_partial_block_done_\@	# Leave Macro if no partial blocks
-	# Read in input data without over reading
-	cmp	$16, \PLAIN_CYPH_LEN
-	jl	.L_fewer_than_16_bytes_\@
-	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
-	jmp	.L_data_read_\@
-
-.L_fewer_than_16_bytes_\@:
-	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
-	mov	\PLAIN_CYPH_LEN, %r12
-	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
-
-	mov PBlockLen(%arg2), %r13
-
-.L_data_read_\@:				# Finished reading in data
-
-	movdqu	PBlockEncKey(%arg2), %xmm9
-	movdqu	HashKey(%arg2), %xmm13
-
-	lea	SHIFT_MASK(%rip), %r12
-
-	# adjust the shuffle mask pointer to be able to shift r13 bytes
-	# r16-r13 is the number of bytes in plaintext mod 16)
-	add	%r13, %r12
-	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
-	pshufb	%xmm2, %xmm9		# shift right r13 bytes
-
-.ifc \operation, dec
-	movdqa	%xmm1, %xmm3
-	pxor	%xmm1, %xmm9		# Ciphertext XOR E(K, Yn)
-
-	mov	\PLAIN_CYPH_LEN, %r10
-	add	%r13, %r10
-	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
-	sub	$16, %r10
-	# Determine if partial block is not being filled and
-	# shift mask accordingly
-	jge	.L_no_extra_mask_1_\@
-	sub	%r10, %r12
-.L_no_extra_mask_1_\@:
-
-	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
-	# get the appropriate mask to mask out bottom r13 bytes of xmm9
-	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
-
-	pand	%xmm1, %xmm3
-	movdqa	SHUF_MASK(%rip), %xmm10
-	pshufb	%xmm10, %xmm3
-	pshufb	%xmm2, %xmm3
-	pxor	%xmm3, \AAD_HASH
-
-	test	%r10, %r10
-	jl	.L_partial_incomplete_1_\@
-
-	# GHASH computation for the last <16 Byte block
-	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-	xor	%eax, %eax
-
-	mov	%rax, PBlockLen(%arg2)
-	jmp	.L_dec_done_\@
-.L_partial_incomplete_1_\@:
-	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
-.L_dec_done_\@:
-	movdqu	\AAD_HASH, AadHash(%arg2)
-.else
-	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
-
-	mov	\PLAIN_CYPH_LEN, %r10
-	add	%r13, %r10
-	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
-	sub	$16, %r10
-	# Determine if partial block is not being filled and
-	# shift mask accordingly
-	jge	.L_no_extra_mask_2_\@
-	sub	%r10, %r12
-.L_no_extra_mask_2_\@:
-
-	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
-	# get the appropriate mask to mask out bottom r13 bytes of xmm9
-	pand	%xmm1, %xmm9
-
-	movdqa	SHUF_MASK(%rip), %xmm1
-	pshufb	%xmm1, %xmm9
-	pshufb	%xmm2, %xmm9
-	pxor	%xmm9, \AAD_HASH
-
-	test	%r10, %r10
-	jl	.L_partial_incomplete_2_\@
-
-	# GHASH computation for the last <16 Byte block
-	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-	xor	%eax, %eax
-
-	mov	%rax, PBlockLen(%arg2)
-	jmp	.L_encode_done_\@
-.L_partial_incomplete_2_\@:
-	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
-.L_encode_done_\@:
-	movdqu	\AAD_HASH, AadHash(%arg2)
-
-	movdqa	SHUF_MASK(%rip), %xmm10
-	# shuffle xmm9 back to output as ciphertext
-	pshufb	%xmm10, %xmm9
-	pshufb	%xmm2, %xmm9
-.endif
-	# output encrypted Bytes
-	test	%r10, %r10
-	jl	.L_partial_fill_\@
-	mov	%r13, %r12
-	mov	$16, %r13
-	# Set r13 to be the number of bytes to write out
-	sub	%r12, %r13
-	jmp	.L_count_set_\@
-.L_partial_fill_\@:
-	mov	\PLAIN_CYPH_LEN, %r13
-.L_count_set_\@:
-	movdqa	%xmm9, %xmm0
-	movq	%xmm0, %rax
-	cmp	$8, %r13
-	jle	.L_less_than_8_bytes_left_\@
-
-	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
-	add	$8, \DATA_OFFSET
-	psrldq	$8, %xmm0
-	movq	%xmm0, %rax
-	sub	$8, %r13
-.L_less_than_8_bytes_left_\@:
-	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
-	add	$1, \DATA_OFFSET
-	shr	$8, %rax
-	sub	$1, %r13
-	jne	.L_less_than_8_bytes_left_\@
-.L_partial_block_done_\@:
-.endm # PARTIAL_BLOCK
-
-/*
-* if a = number of total plaintext bytes
-* b = floor(a/16)
-* num_initial_blocks = b mod 4
-* encrypt the initial num_initial_blocks blocks and apply ghash on
-* the ciphertext
-* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
-* are clobbered
-* arg1, %arg2, %arg3 are used as a pointer only, not modified
-*/
-
-
-.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
-	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
-	MOVADQ		SHUF_MASK(%rip), %xmm14
-
-	movdqu AadHash(%arg2), %xmm\i		    # XMM0 = Y0
-
-	# start AES for num_initial_blocks blocks
-
-	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
-
-.if (\i == 5) || (\i == 6) || (\i == 7)
-
-	MOVADQ		ONE(%RIP),\TMP1
-	MOVADQ		0(%arg1),\TMP2
-.irpc index, \i_seq
-	paddd		\TMP1, \XMM0                 # INCR Y0
-.ifc \operation, dec
-        movdqa     \XMM0, %xmm\index
-.else
-	MOVADQ		\XMM0, %xmm\index
-.endif
-	pshufb	%xmm14, %xmm\index      # perform a 16 byte swap
-	pxor		\TMP2, %xmm\index
-.endr
-	lea	0x10(%arg1),%r10
-	mov	keysize,%eax
-	shr	$2,%eax				# 128->4, 192->6, 256->8
-	add	$5,%eax			      # 128->9, 192->11, 256->13
-
-.Laes_loop_initial_\@:
-	MOVADQ	(%r10),\TMP1
-.irpc	index, \i_seq
-	aesenc	\TMP1, %xmm\index
-.endr
-	add	$16,%r10
-	sub	$1,%eax
-	jnz	.Laes_loop_initial_\@
-
-	MOVADQ	(%r10), \TMP1
-.irpc index, \i_seq
-	aesenclast \TMP1, %xmm\index         # Last Round
-.endr
-.irpc index, \i_seq
-	movdqu	   (%arg4 , %r11, 1), \TMP1
-	pxor	   \TMP1, %xmm\index
-	movdqu	   %xmm\index, (%arg3 , %r11, 1)
-	# write back plaintext/ciphertext for num_initial_blocks
-	add	   $16, %r11
-
-.ifc \operation, dec
-	movdqa     \TMP1, %xmm\index
-.endif
-	pshufb	   %xmm14, %xmm\index
-
-		# prepare plaintext/ciphertext for GHASH computation
-.endr
-.endif
-
-        # apply GHASH on num_initial_blocks blocks
-
-.if \i == 5
-        pxor       %xmm5, %xmm6
-	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-        pxor       %xmm6, %xmm7
-	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-        pxor       %xmm7, %xmm8
-	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 6
-        pxor       %xmm6, %xmm7
-	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-        pxor       %xmm7, %xmm8
-	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 7
-        pxor       %xmm7, %xmm8
-	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.endif
-	cmp	   $64, %r13
-	jl	.L_initial_blocks_done\@
-	# no need for precomputed values
-/*
-*
-* Precomputations for HashKey parallel with encryption of first 4 blocks.
-* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-*/
-	MOVADQ	   ONE(%RIP),\TMP1
-	paddd	   \TMP1, \XMM0              # INCR Y0
-	MOVADQ	   \XMM0, \XMM1
-	pshufb  %xmm14, \XMM1        # perform a 16 byte swap
-
-	paddd	   \TMP1, \XMM0              # INCR Y0
-	MOVADQ	   \XMM0, \XMM2
-	pshufb  %xmm14, \XMM2        # perform a 16 byte swap
-
-	paddd	   \TMP1, \XMM0              # INCR Y0
-	MOVADQ	   \XMM0, \XMM3
-	pshufb %xmm14, \XMM3        # perform a 16 byte swap
-
-	paddd	   \TMP1, \XMM0              # INCR Y0
-	MOVADQ	   \XMM0, \XMM4
-	pshufb %xmm14, \XMM4        # perform a 16 byte swap
-
-	MOVADQ	   0(%arg1),\TMP1
-	pxor	   \TMP1, \XMM1
-	pxor	   \TMP1, \XMM2
-	pxor	   \TMP1, \XMM3
-	pxor	   \TMP1, \XMM4
-.irpc index, 1234 # do 4 rounds
-	movaps 0x10*\index(%arg1), \TMP1
-	aesenc	   \TMP1, \XMM1
-	aesenc	   \TMP1, \XMM2
-	aesenc	   \TMP1, \XMM3
-	aesenc	   \TMP1, \XMM4
-.endr
-.irpc index, 56789 # do next 5 rounds
-	movaps 0x10*\index(%arg1), \TMP1
-	aesenc	   \TMP1, \XMM1
-	aesenc	   \TMP1, \XMM2
-	aesenc	   \TMP1, \XMM3
-	aesenc	   \TMP1, \XMM4
-.endr
-	lea	   0xa0(%arg1),%r10
-	mov	   keysize,%eax
-	shr	   $2,%eax			# 128->4, 192->6, 256->8
-	sub	   $4,%eax			# 128->0, 192->2, 256->4
-	jz	   .Laes_loop_pre_done\@
-
-.Laes_loop_pre_\@:
-	MOVADQ	   (%r10),\TMP2
-.irpc	index, 1234
-	aesenc	   \TMP2, %xmm\index
-.endr
-	add	   $16,%r10
-	sub	   $1,%eax
-	jnz	   .Laes_loop_pre_\@
-
-.Laes_loop_pre_done\@:
-	MOVADQ	   (%r10), \TMP2
-	aesenclast \TMP2, \XMM1
-	aesenclast \TMP2, \XMM2
-	aesenclast \TMP2, \XMM3
-	aesenclast \TMP2, \XMM4
-	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
-	pxor	   \TMP1, \XMM1
-.ifc \operation, dec
-	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
-	movdqa     \TMP1, \XMM1
-.endif
-	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
-	pxor	   \TMP1, \XMM2
-.ifc \operation, dec
-	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
-	movdqa     \TMP1, \XMM2
-.endif
-	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
-	pxor	   \TMP1, \XMM3
-.ifc \operation, dec
-	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
-	movdqa     \TMP1, \XMM3
-.endif
-	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
-	pxor	   \TMP1, \XMM4
-.ifc \operation, dec
-	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
-	movdqa     \TMP1, \XMM4
-.else
-	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
-	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
-	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
-	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
-.endif
-
-	add	   $64, %r11
-	pshufb %xmm14, \XMM1 # perform a 16 byte swap
-	pxor	   \XMMDst, \XMM1
-# combine GHASHed value with the corresponding ciphertext
-	pshufb %xmm14, \XMM2 # perform a 16 byte swap
-	pshufb %xmm14, \XMM3 # perform a 16 byte swap
-	pshufb %xmm14, \XMM4 # perform a 16 byte swap
-
-.L_initial_blocks_done\@:
-
-.endm
-
-/*
-* encrypt 4 blocks at a time
-* ghash the 4 previously encrypted ciphertext blocks
-* arg1, %arg3, %arg4 are used as pointers only, not modified
-* %r11 is the data offset value
-*/
-.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
-TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
-
-	movdqa	  \XMM1, \XMM5
-	movdqa	  \XMM2, \XMM6
-	movdqa	  \XMM3, \XMM7
-	movdqa	  \XMM4, \XMM8
-
-        movdqa    SHUF_MASK(%rip), %xmm15
-        # multiply TMP5 * HashKey using karatsuba
-
-	movdqa	  \XMM5, \TMP4
-	pshufd	  $78, \XMM5, \TMP6
-	pxor	  \XMM5, \TMP6
-	paddd     ONE(%rip), \XMM0		# INCR CNT
-	movdqu	  HashKey_4(%arg2), \TMP5
-	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
-	movdqa    \XMM0, \XMM1
-	paddd     ONE(%rip), \XMM0		# INCR CNT
-	movdqa    \XMM0, \XMM2
-	paddd     ONE(%rip), \XMM0		# INCR CNT
-	movdqa    \XMM0, \XMM3
-	paddd     ONE(%rip), \XMM0		# INCR CNT
-	movdqa    \XMM0, \XMM4
-	pshufb %xmm15, \XMM1	# perform a 16 byte swap
-	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
-	pshufb %xmm15, \XMM2	# perform a 16 byte swap
-	pshufb %xmm15, \XMM3	# perform a 16 byte swap
-	pshufb %xmm15, \XMM4	# perform a 16 byte swap
-
-	pxor	  (%arg1), \XMM1
-	pxor	  (%arg1), \XMM2
-	pxor	  (%arg1), \XMM3
-	pxor	  (%arg1), \XMM4
-	movdqu	  HashKey_4_k(%arg2), \TMP5
-	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
-	movaps 0x10(%arg1), \TMP1
-	aesenc	  \TMP1, \XMM1              # Round 1
-	aesenc	  \TMP1, \XMM2
-	aesenc	  \TMP1, \XMM3
-	aesenc	  \TMP1, \XMM4
-	movaps 0x20(%arg1), \TMP1
-	aesenc	  \TMP1, \XMM1              # Round 2
-	aesenc	  \TMP1, \XMM2
-	aesenc	  \TMP1, \XMM3
-	aesenc	  \TMP1, \XMM4
-	movdqa	  \XMM6, \TMP1
-	pshufd	  $78, \XMM6, \TMP2
-	pxor	  \XMM6, \TMP2
-	movdqu	  HashKey_3(%arg2), \TMP5
-	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
-	movaps 0x30(%arg1), \TMP3
-	aesenc    \TMP3, \XMM1              # Round 3
-	aesenc    \TMP3, \XMM2
-	aesenc    \TMP3, \XMM3
-	aesenc    \TMP3, \XMM4
-	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
-	movaps 0x40(%arg1), \TMP3
-	aesenc	  \TMP3, \XMM1              # Round 4
-	aesenc	  \TMP3, \XMM2
-	aesenc	  \TMP3, \XMM3
-	aesenc	  \TMP3, \XMM4
-	movdqu	  HashKey_3_k(%arg2), \TMP5
-	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
-	movaps 0x50(%arg1), \TMP3
-	aesenc	  \TMP3, \XMM1              # Round 5
-	aesenc	  \TMP3, \XMM2
-	aesenc	  \TMP3, \XMM3
-	aesenc	  \TMP3, \XMM4
-	pxor	  \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
-	pxor	  \XMM6, \XMM5
-	pxor	  \TMP2, \TMP6
-	movdqa	  \XMM7, \TMP1
-	pshufd	  $78, \XMM7, \TMP2
-	pxor	  \XMM7, \TMP2
-	movdqu	  HashKey_2(%arg2), \TMP5
-
-        # Multiply TMP5 * HashKey using karatsuba
-
-	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
-	movaps 0x60(%arg1), \TMP3
-	aesenc	  \TMP3, \XMM1              # Round 6
-	aesenc	  \TMP3, \XMM2
-	aesenc	  \TMP3, \XMM3
-	aesenc	  \TMP3, \XMM4
-	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
-	movaps 0x70(%arg1), \TMP3
-	aesenc	  \TMP3, \XMM1              # Round 7
-	aesenc	  \TMP3, \XMM2
-	aesenc	  \TMP3, \XMM3
-	aesenc	  \TMP3, \XMM4
-	movdqu	  HashKey_2_k(%arg2), \TMP5
-	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
-	movaps 0x80(%arg1), \TMP3
-	aesenc	  \TMP3, \XMM1              # Round 8
-	aesenc	  \TMP3, \XMM2
-	aesenc	  \TMP3, \XMM3
-	aesenc	  \TMP3, \XMM4
-	pxor	  \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
-	pxor	  \XMM7, \XMM5
-	pxor	  \TMP2, \TMP6
-
-        # Multiply XMM8 * HashKey
-        # XMM8 and TMP5 hold the values for the two operands
-
-	movdqa	  \XMM8, \TMP1
-	pshufd	  $78, \XMM8, \TMP2
-	pxor	  \XMM8, \TMP2
-	movdqu	  HashKey(%arg2), \TMP5
-	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
-	movaps 0x90(%arg1), \TMP3
-	aesenc	  \TMP3, \XMM1             # Round 9
-	aesenc	  \TMP3, \XMM2
-	aesenc	  \TMP3, \XMM3
-	aesenc	  \TMP3, \XMM4
-	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
-	lea	  0xa0(%arg1),%r10
-	mov	  keysize,%eax
-	shr	  $2,%eax			# 128->4, 192->6, 256->8
-	sub	  $4,%eax			# 128->0, 192->2, 256->4
-	jz	  .Laes_loop_par_enc_done\@
-
-.Laes_loop_par_enc\@:
-	MOVADQ	  (%r10),\TMP3
-.irpc	index, 1234
-	aesenc	  \TMP3, %xmm\index
-.endr
-	add	  $16,%r10
-	sub	  $1,%eax
-	jnz	  .Laes_loop_par_enc\@
-
-.Laes_loop_par_enc_done\@:
-	MOVADQ	  (%r10), \TMP3
-	aesenclast \TMP3, \XMM1           # Round 10
-	aesenclast \TMP3, \XMM2
-	aesenclast \TMP3, \XMM3
-	aesenclast \TMP3, \XMM4
-	movdqu    HashKey_k(%arg2), \TMP5
-	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
-	movdqu	  (%arg4,%r11,1), \TMP3
-	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
-	movdqu	  16(%arg4,%r11,1), \TMP3
-	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
-	movdqu	  32(%arg4,%r11,1), \TMP3
-	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
-	movdqu	  48(%arg4,%r11,1), \TMP3
-	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
-        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
-        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
-        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
-        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
-	pshufb %xmm15, \XMM1        # perform a 16 byte swap
-	pshufb %xmm15, \XMM2	# perform a 16 byte swap
-	pshufb %xmm15, \XMM3	# perform a 16 byte swap
-	pshufb %xmm15, \XMM4	# perform a 16 byte swap
-
-	pxor	  \TMP4, \TMP1
-	pxor	  \XMM8, \XMM5
-	pxor	  \TMP6, \TMP2
-	pxor	  \TMP1, \TMP2
-	pxor	  \XMM5, \TMP2
-	movdqa	  \TMP2, \TMP3
-	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
-	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
-	pxor	  \TMP3, \XMM5
-	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
-
-        # first phase of reduction
-
-	movdqa    \XMM5, \TMP2
-	movdqa    \XMM5, \TMP3
-	movdqa    \XMM5, \TMP4
-# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
-	pslld     $31, \TMP2                   # packed right shift << 31
-	pslld     $30, \TMP3                   # packed right shift << 30
-	pslld     $25, \TMP4                   # packed right shift << 25
-	pxor      \TMP3, \TMP2	               # xor the shifted versions
-	pxor      \TMP4, \TMP2
-	movdqa    \TMP2, \TMP5
-	psrldq    $4, \TMP5                    # right shift T5 1 DW
-	pslldq    $12, \TMP2                   # left shift T2 3 DWs
-	pxor      \TMP2, \XMM5
-
-        # second phase of reduction
-
-	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
-	movdqa    \XMM5,\TMP3
-	movdqa    \XMM5,\TMP4
-	psrld     $1, \TMP2                    # packed left shift >>1
-	psrld     $2, \TMP3                    # packed left shift >>2
-	psrld     $7, \TMP4                    # packed left shift >>7
-	pxor      \TMP3,\TMP2		       # xor the shifted versions
-	pxor      \TMP4,\TMP2
-	pxor      \TMP5, \TMP2
-	pxor      \TMP2, \XMM5
-	pxor      \TMP1, \XMM5                 # result is in TMP1
-
-	pxor	  \XMM5, \XMM1
-.endm
-
-/*
-* decrypt 4 blocks at a time
-* ghash the 4 previously decrypted ciphertext blocks
-* arg1, %arg3, %arg4 are used as pointers only, not modified
-* %r11 is the data offset value
-*/
-.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
-TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
-
-	movdqa	  \XMM1, \XMM5
-	movdqa	  \XMM2, \XMM6
-	movdqa	  \XMM3, \XMM7
-	movdqa	  \XMM4, \XMM8
-
-        movdqa    SHUF_MASK(%rip), %xmm15
-        # multiply TMP5 * HashKey using karatsuba
-
-	movdqa	  \XMM5, \TMP4
-	pshufd	  $78, \XMM5, \TMP6
-	pxor	  \XMM5, \TMP6
-	paddd     ONE(%rip), \XMM0		# INCR CNT
-	movdqu	  HashKey_4(%arg2), \TMP5
-	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
-	movdqa    \XMM0, \XMM1
-	paddd     ONE(%rip), \XMM0		# INCR CNT
-	movdqa    \XMM0, \XMM2
-	paddd     ONE(%rip), \XMM0		# INCR CNT
-	movdqa    \XMM0, \XMM3
-	paddd     ONE(%rip), \XMM0		# INCR CNT
-	movdqa    \XMM0, \XMM4
-	pshufb %xmm15, \XMM1	# perform a 16 byte swap
-	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
-	pshufb %xmm15, \XMM2	# perform a 16 byte swap
-	pshufb %xmm15, \XMM3	# perform a 16 byte swap
-	pshufb %xmm15, \XMM4	# perform a 16 byte swap
-
-	pxor	  (%arg1), \XMM1
-	pxor	  (%arg1), \XMM2
-	pxor	  (%arg1), \XMM3
-	pxor	  (%arg1), \XMM4
-	movdqu	  HashKey_4_k(%arg2), \TMP5
-	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
-	movaps 0x10(%arg1), \TMP1
-	aesenc	  \TMP1, \XMM1              # Round 1
-	aesenc	  \TMP1, \XMM2
-	aesenc	  \TMP1, \XMM3
-	aesenc	  \TMP1, \XMM4
-	movaps 0x20(%arg1), \TMP1
-	aesenc	  \TMP1, \XMM1              # Round 2
-	aesenc	  \TMP1, \XMM2
-	aesenc	  \TMP1, \XMM3
-	aesenc	  \TMP1, \XMM4
-	movdqa	  \XMM6, \TMP1
-	pshufd	  $78, \XMM6, \TMP2
-	pxor	  \XMM6, \TMP2
-	movdqu	  HashKey_3(%arg2), \TMP5
-	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
-	movaps 0x30(%arg1), \TMP3
-	aesenc    \TMP3, \XMM1              # Round 3
-	aesenc    \TMP3, \XMM2
-	aesenc    \TMP3, \XMM3
-	aesenc    \TMP3, \XMM4
-	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
-	movaps 0x40(%arg1), \TMP3
-	aesenc	  \TMP3, \XMM1              # Round 4
-	aesenc	  \TMP3, \XMM2
-	aesenc	  \TMP3, \XMM3
-	aesenc	  \TMP3, \XMM4
-	movdqu	  HashKey_3_k(%arg2), \TMP5
-	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
-	movaps 0x50(%arg1), \TMP3
-	aesenc	  \TMP3, \XMM1              # Round 5
-	aesenc	  \TMP3, \XMM2
-	aesenc	  \TMP3, \XMM3
-	aesenc	  \TMP3, \XMM4
-	pxor	  \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
-	pxor	  \XMM6, \XMM5
-	pxor	  \TMP2, \TMP6
-	movdqa	  \XMM7, \TMP1
-	pshufd	  $78, \XMM7, \TMP2
-	pxor	  \XMM7, \TMP2
-	movdqu	  HashKey_2(%arg2), \TMP5
-
-        # Multiply TMP5 * HashKey using karatsuba
-
-	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
-	movaps 0x60(%arg1), \TMP3
-	aesenc	  \TMP3, \XMM1              # Round 6
-	aesenc	  \TMP3, \XMM2
-	aesenc	  \TMP3, \XMM3
-	aesenc	  \TMP3, \XMM4
-	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
-	movaps 0x70(%arg1), \TMP3
-	aesenc	  \TMP3, \XMM1              # Round 7
-	aesenc	  \TMP3, \XMM2
-	aesenc	  \TMP3, \XMM3
-	aesenc	  \TMP3, \XMM4
-	movdqu	  HashKey_2_k(%arg2), \TMP5
-	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
-	movaps 0x80(%arg1), \TMP3
-	aesenc	  \TMP3, \XMM1              # Round 8
-	aesenc	  \TMP3, \XMM2
-	aesenc	  \TMP3, \XMM3
-	aesenc	  \TMP3, \XMM4
-	pxor	  \TMP1, \TMP4
-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
-	pxor	  \XMM7, \XMM5
-	pxor	  \TMP2, \TMP6
-
-        # Multiply XMM8 * HashKey
-        # XMM8 and TMP5 hold the values for the two operands
-
-	movdqa	  \XMM8, \TMP1
-	pshufd	  $78, \XMM8, \TMP2
-	pxor	  \XMM8, \TMP2
-	movdqu	  HashKey(%arg2), \TMP5
-	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
-	movaps 0x90(%arg1), \TMP3
-	aesenc	  \TMP3, \XMM1             # Round 9
-	aesenc	  \TMP3, \XMM2
-	aesenc	  \TMP3, \XMM3
-	aesenc	  \TMP3, \XMM4
-	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
-	lea	  0xa0(%arg1),%r10
-	mov	  keysize,%eax
-	shr	  $2,%eax		        # 128->4, 192->6, 256->8
-	sub	  $4,%eax			# 128->0, 192->2, 256->4
-	jz	  .Laes_loop_par_dec_done\@
-
-.Laes_loop_par_dec\@:
-	MOVADQ	  (%r10),\TMP3
-.irpc	index, 1234
-	aesenc	  \TMP3, %xmm\index
-.endr
-	add	  $16,%r10
-	sub	  $1,%eax
-	jnz	  .Laes_loop_par_dec\@
-
-.Laes_loop_par_dec_done\@:
-	MOVADQ	  (%r10), \TMP3
-	aesenclast \TMP3, \XMM1           # last round
-	aesenclast \TMP3, \XMM2
-	aesenclast \TMP3, \XMM3
-	aesenclast \TMP3, \XMM4
-	movdqu    HashKey_k(%arg2), \TMP5
-	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
-	movdqu	  (%arg4,%r11,1), \TMP3
-	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
-	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
-	movdqa    \TMP3, \XMM1
-	movdqu	  16(%arg4,%r11,1), \TMP3
-	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
-	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
-	movdqa    \TMP3, \XMM2
-	movdqu	  32(%arg4,%r11,1), \TMP3
-	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
-	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
-	movdqa    \TMP3, \XMM3
-	movdqu	  48(%arg4,%r11,1), \TMP3
-	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
-	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
-	movdqa    \TMP3, \XMM4
-	pshufb %xmm15, \XMM1        # perform a 16 byte swap
-	pshufb %xmm15, \XMM2	# perform a 16 byte swap
-	pshufb %xmm15, \XMM3	# perform a 16 byte swap
-	pshufb %xmm15, \XMM4	# perform a 16 byte swap
-
-	pxor	  \TMP4, \TMP1
-	pxor	  \XMM8, \XMM5
-	pxor	  \TMP6, \TMP2
-	pxor	  \TMP1, \TMP2
-	pxor	  \XMM5, \TMP2
-	movdqa	  \TMP2, \TMP3
-	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
-	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
-	pxor	  \TMP3, \XMM5
-	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
-
-        # first phase of reduction
-
-	movdqa    \XMM5, \TMP2
-	movdqa    \XMM5, \TMP3
-	movdqa    \XMM5, \TMP4
-# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
-	pslld     $31, \TMP2                   # packed right shift << 31
-	pslld     $30, \TMP3                   # packed right shift << 30
-	pslld     $25, \TMP4                   # packed right shift << 25
-	pxor      \TMP3, \TMP2	               # xor the shifted versions
-	pxor      \TMP4, \TMP2
-	movdqa    \TMP2, \TMP5
-	psrldq    $4, \TMP5                    # right shift T5 1 DW
-	pslldq    $12, \TMP2                   # left shift T2 3 DWs
-	pxor      \TMP2, \XMM5
-
-        # second phase of reduction
-
-	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
-	movdqa    \XMM5,\TMP3
-	movdqa    \XMM5,\TMP4
-	psrld     $1, \TMP2                    # packed left shift >>1
-	psrld     $2, \TMP3                    # packed left shift >>2
-	psrld     $7, \TMP4                    # packed left shift >>7
-	pxor      \TMP3,\TMP2		       # xor the shifted versions
-	pxor      \TMP4,\TMP2
-	pxor      \TMP5, \TMP2
-	pxor      \TMP2, \XMM5
-	pxor      \TMP1, \XMM5                 # result is in TMP1
-
-	pxor	  \XMM5, \XMM1
-.endm
-
-/* GHASH the last 4 ciphertext blocks. */
-.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
-TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
-
-        # Multiply TMP6 * HashKey (using Karatsuba)
-
-	movdqa	  \XMM1, \TMP6
-	pshufd	  $78, \XMM1, \TMP2
-	pxor	  \XMM1, \TMP2
-	movdqu	  HashKey_4(%arg2), \TMP5
-	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
-	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
-	movdqu	  HashKey_4_k(%arg2), \TMP4
-	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
-	movdqa	  \XMM1, \XMMDst
-	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
-
-        # Multiply TMP1 * HashKey (using Karatsuba)
-
-	movdqa	  \XMM2, \TMP1
-	pshufd	  $78, \XMM2, \TMP2
-	pxor	  \XMM2, \TMP2
-	movdqu	  HashKey_3(%arg2), \TMP5
-	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
-	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
-	movdqu	  HashKey_3_k(%arg2), \TMP4
-	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
-	pxor	  \TMP1, \TMP6
-	pxor	  \XMM2, \XMMDst
-	pxor	  \TMP2, \XMM1
-# results accumulated in TMP6, XMMDst, XMM1
-
-        # Multiply TMP1 * HashKey (using Karatsuba)
-
-	movdqa	  \XMM3, \TMP1
-	pshufd	  $78, \XMM3, \TMP2
-	pxor	  \XMM3, \TMP2
-	movdqu	  HashKey_2(%arg2), \TMP5
-	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
-	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
-	movdqu	  HashKey_2_k(%arg2), \TMP4
-	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
-	pxor	  \TMP1, \TMP6
-	pxor	  \XMM3, \XMMDst
-	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
-
-        # Multiply TMP1 * HashKey (using Karatsuba)
-	movdqa	  \XMM4, \TMP1
-	pshufd	  $78, \XMM4, \TMP2
-	pxor	  \XMM4, \TMP2
-	movdqu	  HashKey(%arg2), \TMP5
-	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
-	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
-	movdqu	  HashKey_k(%arg2), \TMP4
-	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
-	pxor	  \TMP1, \TMP6
-	pxor	  \XMM4, \XMMDst
-	pxor	  \XMM1, \TMP2
-	pxor	  \TMP6, \TMP2
-	pxor	  \XMMDst, \TMP2
-	# middle section of the temp results combined as in karatsuba algorithm
-	movdqa	  \TMP2, \TMP4
-	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
-	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
-	pxor	  \TMP4, \XMMDst
-	pxor	  \TMP2, \TMP6
-# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
-	# first phase of the reduction
-	movdqa    \XMMDst, \TMP2
-	movdqa    \XMMDst, \TMP3
-	movdqa    \XMMDst, \TMP4
-# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
-	pslld     $31, \TMP2                # packed right shifting << 31
-	pslld     $30, \TMP3                # packed right shifting << 30
-	pslld     $25, \TMP4                # packed right shifting << 25
-	pxor      \TMP3, \TMP2              # xor the shifted versions
-	pxor      \TMP4, \TMP2
-	movdqa    \TMP2, \TMP7
-	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
-	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
-	pxor      \TMP2, \XMMDst
-
-        # second phase of the reduction
-	movdqa    \XMMDst, \TMP2
-	# make 3 copies of XMMDst for doing 3 shift operations
-	movdqa    \XMMDst, \TMP3
-	movdqa    \XMMDst, \TMP4
-	psrld     $1, \TMP2                 # packed left shift >> 1
-	psrld     $2, \TMP3                 # packed left shift >> 2
-	psrld     $7, \TMP4                 # packed left shift >> 7
-	pxor      \TMP3, \TMP2              # xor the shifted versions
-	pxor      \TMP4, \TMP2
-	pxor      \TMP7, \TMP2
-	pxor      \TMP2, \XMMDst
-	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
-.endm
-
-
-/* Encryption of a single block
-* uses eax & r10
-*/
-
-.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
-
-	pxor		(%arg1), \XMM0
-	mov		keysize,%eax
-	shr		$2,%eax			# 128->4, 192->6, 256->8
-	add		$5,%eax			# 128->9, 192->11, 256->13
-	lea		16(%arg1), %r10	  # get first expanded key address
-
-_esb_loop_\@:
-	MOVADQ		(%r10),\TMP1
-	aesenc		\TMP1,\XMM0
-	add		$16,%r10
-	sub		$1,%eax
-	jnz		_esb_loop_\@
-
-	MOVADQ		(%r10),\TMP1
-	aesenclast	\TMP1,\XMM0
-.endm
-
-/*****************************************************************************
-* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
-*                     struct gcm_context_data *data,
-*                                         // context data
-*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
-*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
-*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
-*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
-*                     const u8 *aad,      // Additional Authentication Data (AAD)
-*                     u64 aad_len)        // Length of AAD in bytes.
-*/
-SYM_FUNC_START(aesni_gcm_init)
-	FUNC_SAVE
-	GCM_INIT %arg3, %arg4,%arg5, %arg6
-	FUNC_RESTORE
-	RET
-SYM_FUNC_END(aesni_gcm_init)
-
-/*****************************************************************************
-* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
-*                    struct gcm_context_data *data,
-*                                        // context data
-*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
-*                    const u8 *in,       // Plaintext input
-*                    u64 plaintext_len,  // Length of data in bytes for encryption.
-*/
-SYM_FUNC_START(aesni_gcm_enc_update)
-	FUNC_SAVE
-	GCM_ENC_DEC enc
-	FUNC_RESTORE
-	RET
-SYM_FUNC_END(aesni_gcm_enc_update)
-
-/*****************************************************************************
-* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
-*                    struct gcm_context_data *data,
-*                                        // context data
-*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
-*                    const u8 *in,       // Plaintext input
-*                    u64 plaintext_len,  // Length of data in bytes for encryption.
-*/
-SYM_FUNC_START(aesni_gcm_dec_update)
-	FUNC_SAVE
-	GCM_ENC_DEC dec
-	FUNC_RESTORE
-	RET
-SYM_FUNC_END(aesni_gcm_dec_update)
-
-/*****************************************************************************
-* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
-*                    struct gcm_context_data *data,
-*                                        // context data
-*                    u8 *auth_tag,       // Authenticated Tag output.
-*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
-*                                        // 12 or 8.
-*/
-SYM_FUNC_START(aesni_gcm_finalize)
-	FUNC_SAVE
-	GCM_COMPLETE %arg3 %arg4
-	FUNC_RESTORE
-	RET
-SYM_FUNC_END(aesni_gcm_finalize)
-
-#endif
-
 SYM_FUNC_START_LOCAL(_key_expansion_256a)
 	pshufd $0b11111111, %xmm1, %xmm1
 	shufps $0b00010000, %xmm0, %xmm4
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
deleted file mode 100644
index 8c9749ed0651..000000000000
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ /dev/null
@@ -1,2804 +0,0 @@
-########################################################################
-# Copyright (c) 2013, Intel Corporation
-#
-# This software is available to you under a choice of one of two
-# licenses.  You may choose to be licensed under the terms of the GNU
-# General Public License (GPL) Version 2, available from the file
-# COPYING in the main directory of this source tree, or the
-# OpenIB.org BSD license below:
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-# * Redistributions of source code must retain the above copyright
-#   notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright
-#   notice, this list of conditions and the following disclaimer in the
-#   documentation and/or other materials provided with the
-#   distribution.
-#
-# * Neither the name of the Intel Corporation nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-#
-# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR
-# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-########################################################################
-##
-## Authors:
-##	Erdinc Ozturk <erdinc.ozturk@intel.com>
-##	Vinodh Gopal <vinodh.gopal@intel.com>
-##	James Guilford <james.guilford@intel.com>
-##	Tim Chen <tim.c.chen@linux.intel.com>
-##
-## References:
-##       This code was derived and highly optimized from the code described in paper:
-##               Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
-##			on Intel Architecture Processors. August, 2010
-##       The details of the implementation is explained in:
-##               Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
-##			on Intel Architecture Processors. October, 2012.
-##
-## Assumptions:
-##
-##
-##
-## iv:
-##       0                   1                   2                   3
-##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##       |                             Salt  (From the SA)               |
-##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##       |                     Initialization Vector                     |
-##       |         (This is the sequence number from IPSec header)       |
-##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##       |                              0x1                              |
-##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##
-##
-##
-## AAD:
-##       AAD padded to 128 bits with 0
-##       for example, assume AAD is a u32 vector
-##
-##       if AAD is 8 bytes:
-##       AAD[3] = {A0, A1}#
-##       padded AAD in xmm register = {A1 A0 0 0}
-##
-##       0                   1                   2                   3
-##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##       |                               SPI (A1)                        |
-##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##       |                     32-bit Sequence Number (A0)               |
-##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##       |                              0x0                              |
-##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##
-##                                       AAD Format with 32-bit Sequence Number
-##
-##       if AAD is 12 bytes:
-##       AAD[3] = {A0, A1, A2}#
-##       padded AAD in xmm register = {A2 A1 A0 0}
-##
-##       0                   1                   2                   3
-##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##       |                               SPI (A2)                        |
-##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##       |                 64-bit Extended Sequence Number {A1,A0}       |
-##       |                                                               |
-##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##       |                              0x0                              |
-##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
-##
-##        AAD Format with 64-bit Extended Sequence Number
-##
-##
-## aadLen:
-##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
-##	 The code additionally supports aadLen of length 16 bytes.
-##
-## TLen:
-##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
-##
-## poly = x^128 + x^127 + x^126 + x^121 + 1
-## throughout the code, one tab and two tab indentations are used. one tab is
-## for GHASH part, two tabs is for AES part.
-##
-
-#include <linux/linkage.h>
-
-# constants in mergeable sections, linker can reorder and merge
-.section	.rodata.cst16.POLY, "aM", @progbits, 16
-.align 16
-POLY:            .octa     0xC2000000000000000000000000000001
-
-.section	.rodata.cst16.POLY2, "aM", @progbits, 16
-.align 16
-POLY2:           .octa     0xC20000000000000000000001C2000000
-
-.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
-.align 16
-TWOONE:          .octa     0x00000001000000000000000000000001
-
-.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
-.align 16
-SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F
-
-.section	.rodata.cst16.ONE, "aM", @progbits, 16
-.align 16
-ONE:             .octa     0x00000000000000000000000000000001
-
-.section	.rodata.cst16.ONEf, "aM", @progbits, 16
-.align 16
-ONEf:            .octa     0x01000000000000000000000000000000
-
-# order of these constants should not change.
-# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
-.section	.rodata, "a", @progbits
-.align 16
-SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
-ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
-                 .octa     0x00000000000000000000000000000000
-
-.text
-
-
-#define AadHash 16*0
-#define AadLen 16*1
-#define InLen (16*1)+8
-#define PBlockEncKey 16*2
-#define OrigIV 16*3
-#define CurCount 16*4
-#define PBlockLen 16*5
-
-HashKey        = 16*6   # store HashKey <<1 mod poly here
-HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
-HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
-HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
-HashKey_5      = 16*10   # store HashKey^5 <<1 mod poly here
-HashKey_6      = 16*11   # store HashKey^6 <<1 mod poly here
-HashKey_7      = 16*12   # store HashKey^7 <<1 mod poly here
-HashKey_8      = 16*13   # store HashKey^8 <<1 mod poly here
-HashKey_k      = 16*14   # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
-HashKey_2_k    = 16*15   # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
-HashKey_3_k    = 16*16   # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
-HashKey_4_k    = 16*17   # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
-HashKey_5_k    = 16*18   # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
-HashKey_6_k    = 16*19   # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
-HashKey_7_k    = 16*20   # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
-HashKey_8_k    = 16*21   # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
-
-#define arg1 %rdi
-#define arg2 %rsi
-#define arg3 %rdx
-#define arg4 %rcx
-#define arg5 %r8
-#define arg6 %r9
-#define keysize 2*15*16(arg1)
-
-i = 0
-j = 0
-
-out_order = 0
-in_order = 1
-DEC = 0
-ENC = 1
-
-.macro define_reg r n
-reg_\r = %xmm\n
-.endm
-
-.macro setreg
-.altmacro
-define_reg i %i
-define_reg j %j
-.noaltmacro
-.endm
-
-TMP1 =   16*0    # Temporary storage for AAD
-TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
-TMP3 =   16*2    # Temporary storage for AES State 3
-TMP4 =   16*3    # Temporary storage for AES State 4
-TMP5 =   16*4    # Temporary storage for AES State 5
-TMP6 =   16*5    # Temporary storage for AES State 6
-TMP7 =   16*6    # Temporary storage for AES State 7
-TMP8 =   16*7    # Temporary storage for AES State 8
-
-VARIABLE_OFFSET = 16*8
-
-################################
-# Utility Macros
-################################
-
-.macro FUNC_SAVE
-        push    %r12
-        push    %r13
-        push    %r15
-
-	push	%rbp
-	mov	%rsp, %rbp
-
-        sub     $VARIABLE_OFFSET, %rsp
-        and     $~63, %rsp                    # align rsp to 64 bytes
-.endm
-
-.macro FUNC_RESTORE
-        mov     %rbp, %rsp
-	pop	%rbp
-
-        pop     %r15
-        pop     %r13
-        pop     %r12
-.endm
-
-# Encryption of a single block
-.macro ENCRYPT_SINGLE_BLOCK REP XMM0
-                vpxor    (arg1), \XMM0, \XMM0
-               i = 1
-               setreg
-.rep \REP
-                vaesenc  16*i(arg1), \XMM0, \XMM0
-               i = (i+1)
-               setreg
-.endr
-                vaesenclast 16*i(arg1), \XMM0, \XMM0
-.endm
-
-# combined for GCM encrypt and decrypt functions
-# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r15, rax
-.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
-        vmovdqu AadHash(arg2), %xmm8
-        vmovdqu  HashKey(arg2), %xmm13      # xmm13 = HashKey
-        add arg5, InLen(arg2)
-
-        # initialize the data pointer offset as zero
-        xor     %r11d, %r11d
-
-        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
-        sub %r11, arg5
-
-        mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
-        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)
-
-        mov     %r13, %r12
-        shr     $4, %r12
-        and     $7, %r12
-        jz      .L_initial_num_blocks_is_0\@
-
-        cmp     $7, %r12
-        je      .L_initial_num_blocks_is_7\@
-        cmp     $6, %r12
-        je      .L_initial_num_blocks_is_6\@
-        cmp     $5, %r12
-        je      .L_initial_num_blocks_is_5\@
-        cmp     $4, %r12
-        je      .L_initial_num_blocks_is_4\@
-        cmp     $3, %r12
-        je      .L_initial_num_blocks_is_3\@
-        cmp     $2, %r12
-        je      .L_initial_num_blocks_is_2\@
-
-        jmp     .L_initial_num_blocks_is_1\@
-
-.L_initial_num_blocks_is_7\@:
-        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*7, %r13
-        jmp     .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_6\@:
-        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*6, %r13
-        jmp     .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_5\@:
-        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*5, %r13
-        jmp     .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_4\@:
-        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*4, %r13
-        jmp     .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_3\@:
-        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*3, %r13
-        jmp     .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_2\@:
-        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*2, %r13
-        jmp     .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_1\@:
-        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-        sub     $16*1, %r13
-        jmp     .L_initial_blocks_encrypted\@
-
-.L_initial_num_blocks_is_0\@:
-        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-
-
-.L_initial_blocks_encrypted\@:
-        test    %r13, %r13
-        je      .L_zero_cipher_left\@
-
-        sub     $128, %r13
-        je      .L_eight_cipher_left\@
-
-
-
-
-        vmovd   %xmm9, %r15d
-        and     $255, %r15d
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-.L_encrypt_by_8_new\@:
-        cmp     $(255-8), %r15d
-        jg      .L_encrypt_by_8\@
-
-
-
-        add     $8, %r15b
-        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
-        add     $128, %r11
-        sub     $128, %r13
-        jne     .L_encrypt_by_8_new\@
-
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        jmp     .L_eight_cipher_left\@
-
-.L_encrypt_by_8\@:
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        add     $8, %r15b
-        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        add     $128, %r11
-        sub     $128, %r13
-        jne     .L_encrypt_by_8_new\@
-
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-
-
-.L_eight_cipher_left\@:
-        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
-
-
-.L_zero_cipher_left\@:
-        vmovdqu %xmm14, AadHash(arg2)
-        vmovdqu %xmm9, CurCount(arg2)
-
-        # check for 0 length
-        mov     arg5, %r13
-        and     $15, %r13                            # r13 = (arg5 mod 16)
-
-        je      .L_multiple_of_16_bytes\@
-
-        # handle the last <16 Byte block separately
-
-        mov %r13, PBlockLen(arg2)
-
-        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
-        vmovdqu %xmm9, CurCount(arg2)
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Yn)
-        vmovdqu %xmm9, PBlockEncKey(arg2)
-
-        cmp $16, arg5
-        jge .L_large_enough_update\@
-
-        lea (arg4,%r11,1), %r10
-        mov %r13, %r12
-
-        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
-
-        lea     SHIFT_MASK+16(%rip), %r12
-        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
-						     # able to shift 16-r13 bytes (r13 is the
-	# number of bytes in plaintext mod 16)
-
-        jmp .L_final_ghash_mul\@
-
-.L_large_enough_update\@:
-        sub $16, %r11
-        add %r13, %r11
-
-        # receive the last <16 Byte block
-        vmovdqu	(arg4, %r11, 1), %xmm1
-
-        sub	%r13, %r11
-        add	$16, %r11
-
-        lea	SHIFT_MASK+16(%rip), %r12
-        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
-        # (r13 is the number of bytes in plaintext mod 16)
-        sub	%r13, %r12
-        # get the appropriate shuffle mask
-        vmovdqu	(%r12), %xmm2
-        # shift right 16-r13 bytes
-        vpshufb  %xmm2, %xmm1, %xmm1
-
-.L_final_ghash_mul\@:
-        .if  \ENC_DEC ==  DEC
-        vmovdqa %xmm1, %xmm2
-        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
-        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
-						     # mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm2, %xmm2
-        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
-        vpxor   %xmm2, %xmm14, %xmm14
-
-        vmovdqu %xmm14, AadHash(arg2)
-        .else
-        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
-        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
-						     # mask out top 16-r13 bytes of xmm9
-        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-        vpxor   %xmm9, %xmm14, %xmm14
-
-        vmovdqu %xmm14, AadHash(arg2)
-        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
-        .endif
-
-
-        #############################
-        # output r13 Bytes
-        vmovq   %xmm9, %rax
-        cmp     $8, %r13
-        jle     .L_less_than_8_bytes_left\@
-
-        mov     %rax, (arg3 , %r11)
-        add     $8, %r11
-        vpsrldq $8, %xmm9, %xmm9
-        vmovq   %xmm9, %rax
-        sub     $8, %r13
-
-.L_less_than_8_bytes_left\@:
-        movb    %al, (arg3 , %r11)
-        add     $1, %r11
-        shr     $8, %rax
-        sub     $1, %r13
-        jne     .L_less_than_8_bytes_left\@
-        #############################
-
-.L_multiple_of_16_bytes\@:
-.endm
-
-
-# GCM_COMPLETE Finishes update of tag of last partial block
-# Output: Authorization Tag (AUTH_TAG)
-# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
-.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
-        vmovdqu AadHash(arg2), %xmm14
-        vmovdqu HashKey(arg2), %xmm13
-
-        mov PBlockLen(arg2), %r12
-        test %r12, %r12
-        je .L_partial_done\@
-
-	#GHASH computation for the last <16 Byte block
-        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-
-.L_partial_done\@:
-        mov AadLen(arg2), %r12                          # r12 = aadLen (number of bytes)
-        shl     $3, %r12                             # convert into number of bits
-        vmovd   %r12d, %xmm15                        # len(A) in xmm15
-
-        mov InLen(arg2), %r12
-        shl     $3, %r12                        # len(C) in bits  (*128)
-        vmovq   %r12, %xmm1
-        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
-        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)
-
-        vpxor   %xmm15, %xmm14, %xmm14
-        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
-        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap
-
-        vmovdqu OrigIV(arg2), %xmm9
-
-        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9                # E(K, Y0)
-
-        vpxor   %xmm14, %xmm9, %xmm9
-
-
-
-.L_return_T\@:
-        mov     \AUTH_TAG, %r10              # r10 = authTag
-        mov     \AUTH_TAG_LEN, %r11              # r11 = auth_tag_len
-
-        cmp     $16, %r11
-        je      .L_T_16\@
-
-        cmp     $8, %r11
-        jl      .L_T_4\@
-
-.L_T_8\@:
-        vmovq   %xmm9, %rax
-        mov     %rax, (%r10)
-        add     $8, %r10
-        sub     $8, %r11
-        vpsrldq $8, %xmm9, %xmm9
-        test    %r11, %r11
-        je     .L_return_T_done\@
-.L_T_4\@:
-        vmovd   %xmm9, %eax
-        mov     %eax, (%r10)
-        add     $4, %r10
-        sub     $4, %r11
-        vpsrldq     $4, %xmm9, %xmm9
-        test    %r11, %r11
-        je     .L_return_T_done\@
-.L_T_123\@:
-        vmovd     %xmm9, %eax
-        cmp     $2, %r11
-        jl     .L_T_1\@
-        mov     %ax, (%r10)
-        cmp     $2, %r11
-        je     .L_return_T_done\@
-        add     $2, %r10
-        sar     $16, %eax
-.L_T_1\@:
-        mov     %al, (%r10)
-        jmp     .L_return_T_done\@
-
-.L_T_16\@:
-        vmovdqu %xmm9, (%r10)
-
-.L_return_T_done\@:
-.endm
-
-.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
-
-	mov     \AAD, %r10                      # r10 = AAD
-	mov     \AADLEN, %r12                      # r12 = aadLen
-
-
-	mov     %r12, %r11
-
-	vpxor   \T8, \T8, \T8
-	vpxor   \T7, \T7, \T7
-	cmp     $16, %r11
-	jl      .L_get_AAD_rest8\@
-.L_get_AAD_blocks\@:
-	vmovdqu (%r10), \T7
-	vpshufb SHUF_MASK(%rip), \T7, \T7
-	vpxor   \T7, \T8, \T8
-	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
-	add     $16, %r10
-	sub     $16, %r12
-	sub     $16, %r11
-	cmp     $16, %r11
-	jge     .L_get_AAD_blocks\@
-	vmovdqu \T8, \T7
-	test    %r11, %r11
-	je      .L_get_AAD_done\@
-
-	vpxor   \T7, \T7, \T7
-
-	/* read the last <16B of AAD. since we have at least 4B of
-	data right after the AAD (the ICV, and maybe some CT), we can
-	read 4B/8B blocks safely, and then get rid of the extra stuff */
-.L_get_AAD_rest8\@:
-	cmp     $4, %r11
-	jle     .L_get_AAD_rest4\@
-	movq    (%r10), \T1
-	add     $8, %r10
-	sub     $8, %r11
-	vpslldq $8, \T1, \T1
-	vpsrldq $8, \T7, \T7
-	vpxor   \T1, \T7, \T7
-	jmp     .L_get_AAD_rest8\@
-.L_get_AAD_rest4\@:
-	test    %r11, %r11
-	jle     .L_get_AAD_rest0\@
-	mov     (%r10), %eax
-	movq    %rax, \T1
-	add     $4, %r10
-	sub     $4, %r11
-	vpslldq $12, \T1, \T1
-	vpsrldq $4, \T7, \T7
-	vpxor   \T1, \T7, \T7
-.L_get_AAD_rest0\@:
-	/* finalize: shift out the extra bytes we read, and align
-	left. since pslldq can only shift by an immediate, we use
-	vpshufb and a pair of shuffle masks */
-	leaq	ALL_F(%rip), %r11
-	subq	%r12, %r11
-	vmovdqu	16(%r11), \T1
-	andq	$~3, %r11
-	vpshufb (%r11), \T7, \T7
-	vpand	\T1, \T7, \T7
-.L_get_AAD_rest_final\@:
-	vpshufb SHUF_MASK(%rip), \T7, \T7
-	vpxor   \T8, \T7, \T7
-	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6
-
-.L_get_AAD_done\@:
-        vmovdqu \T7, AadHash(arg2)
-.endm
-
-.macro INIT GHASH_MUL PRECOMPUTE
-        mov arg6, %r11
-        mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
-        xor %r11d, %r11d
-        mov %r11, InLen(arg2) # ctx_data.in_length = 0
-
-        mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
-        mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
-        mov arg3, %rax
-        movdqu (%rax), %xmm0
-        movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
-
-        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
-        movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
-
-        vmovdqu  (arg4), %xmm6              # xmm6 = HashKey
-
-        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
-        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
-        vmovdqa  %xmm6, %xmm2
-        vpsllq   $1, %xmm6, %xmm6
-        vpsrlq   $63, %xmm2, %xmm2
-        vmovdqa  %xmm2, %xmm1
-        vpslldq  $8, %xmm2, %xmm2
-        vpsrldq  $8, %xmm1, %xmm1
-        vpor     %xmm2, %xmm6, %xmm6
-        #reduction
-        vpshufd  $0b00100100, %xmm1, %xmm2
-        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
-        vpand    POLY(%rip), %xmm2, %xmm2
-        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
-        #######################################################################
-        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly
-
-        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
-
-        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
-.endm
-
-
-# Reads DLEN bytes starting at DPTR and stores in XMMDst
-# where 0 < DLEN < 16
-# Clobbers %rax, DLEN
-.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
-        vpxor \XMMDst, \XMMDst, \XMMDst
-
-        cmp $8, \DLEN
-        jl .L_read_lt8_\@
-        mov (\DPTR), %rax
-        vpinsrq $0, %rax, \XMMDst, \XMMDst
-        sub $8, \DLEN
-        jz .L_done_read_partial_block_\@
-        xor %eax, %eax
-.L_read_next_byte_\@:
-        shl $8, %rax
-        mov 7(\DPTR, \DLEN, 1), %al
-        dec \DLEN
-        jnz .L_read_next_byte_\@
-        vpinsrq $1, %rax, \XMMDst, \XMMDst
-        jmp .L_done_read_partial_block_\@
-.L_read_lt8_\@:
-        xor %eax, %eax
-.L_read_next_byte_lt8_\@:
-        shl $8, %rax
-        mov -1(\DPTR, \DLEN, 1), %al
-        dec \DLEN
-        jnz .L_read_next_byte_lt8_\@
-        vpinsrq $0, %rax, \XMMDst, \XMMDst
-.L_done_read_partial_block_\@:
-.endm
-
-# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
-# between update calls.
-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
-# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
-# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
-.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
-        AAD_HASH ENC_DEC
-        mov 	PBlockLen(arg2), %r13
-        test	%r13, %r13
-        je	.L_partial_block_done_\@	# Leave Macro if no partial blocks
-        # Read in input data without over reading
-        cmp	$16, \PLAIN_CYPH_LEN
-        jl	.L_fewer_than_16_bytes_\@
-        vmovdqu	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
-        jmp	.L_data_read_\@
-
-.L_fewer_than_16_bytes_\@:
-        lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
-        mov	\PLAIN_CYPH_LEN, %r12
-        READ_PARTIAL_BLOCK %r10 %r12 %xmm1
-
-        mov PBlockLen(arg2), %r13
-
-.L_data_read_\@:				# Finished reading in data
-
-        vmovdqu	PBlockEncKey(arg2), %xmm9
-        vmovdqu	HashKey(arg2), %xmm13
-
-        lea	SHIFT_MASK(%rip), %r12
-
-        # adjust the shuffle mask pointer to be able to shift r13 bytes
-        # r16-r13 is the number of bytes in plaintext mod 16)
-        add	%r13, %r12
-        vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
-        vpshufb %xmm2, %xmm9, %xmm9		# shift right r13 bytes
-
-.if  \ENC_DEC ==  DEC
-        vmovdqa	%xmm1, %xmm3
-        pxor	%xmm1, %xmm9		# Ciphertext XOR E(K, Yn)
-
-        mov	\PLAIN_CYPH_LEN, %r10
-        add	%r13, %r10
-        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
-        sub	$16, %r10
-        # Determine if partial block is not being filled and
-        # shift mask accordingly
-        jge	.L_no_extra_mask_1_\@
-        sub	%r10, %r12
-.L_no_extra_mask_1_\@:
-
-        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
-        # get the appropriate mask to mask out bottom r13 bytes of xmm9
-        vpand	%xmm1, %xmm9, %xmm9		# mask out bottom r13 bytes of xmm9
-
-        vpand	%xmm1, %xmm3, %xmm3
-        vmovdqa	SHUF_MASK(%rip), %xmm10
-        vpshufb	%xmm10, %xmm3, %xmm3
-        vpshufb	%xmm2, %xmm3, %xmm3
-        vpxor	%xmm3, \AAD_HASH, \AAD_HASH
-
-        test	%r10, %r10
-        jl	.L_partial_incomplete_1_\@
-
-        # GHASH computation for the last <16 Byte block
-        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-        xor	%eax,%eax
-
-        mov	%rax, PBlockLen(arg2)
-        jmp	.L_dec_done_\@
-.L_partial_incomplete_1_\@:
-        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
-.L_dec_done_\@:
-        vmovdqu	\AAD_HASH, AadHash(arg2)
-.else
-        vpxor	%xmm1, %xmm9, %xmm9			# Plaintext XOR E(K, Yn)
-
-        mov	\PLAIN_CYPH_LEN, %r10
-        add	%r13, %r10
-        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
-        sub	$16, %r10
-        # Determine if partial block is not being filled and
-        # shift mask accordingly
-        jge	.L_no_extra_mask_2_\@
-        sub	%r10, %r12
-.L_no_extra_mask_2_\@:
-
-        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
-        # get the appropriate mask to mask out bottom r13 bytes of xmm9
-        vpand	%xmm1, %xmm9, %xmm9
-
-        vmovdqa	SHUF_MASK(%rip), %xmm1
-        vpshufb %xmm1, %xmm9, %xmm9
-        vpshufb %xmm2, %xmm9, %xmm9
-        vpxor	%xmm9, \AAD_HASH, \AAD_HASH
-
-        test	%r10, %r10
-        jl	.L_partial_incomplete_2_\@
-
-        # GHASH computation for the last <16 Byte block
-        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
-        xor	%eax,%eax
-
-        mov	%rax, PBlockLen(arg2)
-        jmp	.L_encode_done_\@
-.L_partial_incomplete_2_\@:
-        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
-.L_encode_done_\@:
-        vmovdqu	\AAD_HASH, AadHash(arg2)
-
-        vmovdqa	SHUF_MASK(%rip), %xmm10
-        # shuffle xmm9 back to output as ciphertext
-        vpshufb	%xmm10, %xmm9, %xmm9
-        vpshufb	%xmm2, %xmm9, %xmm9
-.endif
-        # output encrypted Bytes
-        test	%r10, %r10
-        jl	.L_partial_fill_\@
-        mov	%r13, %r12
-        mov	$16, %r13
-        # Set r13 to be the number of bytes to write out
-        sub	%r12, %r13
-        jmp	.L_count_set_\@
-.L_partial_fill_\@:
-        mov	\PLAIN_CYPH_LEN, %r13
-.L_count_set_\@:
-        vmovdqa	%xmm9, %xmm0
-        vmovq	%xmm0, %rax
-        cmp	$8, %r13
-        jle	.L_less_than_8_bytes_left_\@
-
-        mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
-        add	$8, \DATA_OFFSET
-        psrldq	$8, %xmm0
-        vmovq	%xmm0, %rax
-        sub	$8, %r13
-.L_less_than_8_bytes_left_\@:
-        movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
-        add	$1, \DATA_OFFSET
-        shr	$8, %rax
-        sub	$1, %r13
-        jne	.L_less_than_8_bytes_left_\@
-.L_partial_block_done_\@:
-.endm # PARTIAL_BLOCK
-
-###############################################################################
-# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
-# Input: A and B (128-bits each, bit-reflected)
-# Output: C = A*B*x mod poly, (i.e. >>1 )
-# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
-# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
-###############################################################################
-.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
-
-        vpshufd         $0b01001110, \GH, \T2
-        vpshufd         $0b01001110, \HK, \T3
-        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
-        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)
-
-        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
-        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
-        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
-        vpxor           \GH, \T2,\T2
-        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0
-
-        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
-        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
-        vpxor           \T3, \GH, \GH
-        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK
-
-        #first phase of the reduction
-        vpslld  $31, \GH, \T2                   # packed right shifting << 31
-        vpslld  $30, \GH, \T3                   # packed right shifting shift << 30
-        vpslld  $25, \GH, \T4                   # packed right shifting shift << 25
-
-        vpxor   \T3, \T2, \T2                   # xor the shifted versions
-        vpxor   \T4, \T2, \T2
-
-        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW
-
-        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
-        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete
-
-        #second phase of the reduction
-
-        vpsrld  $1,\GH, \T2                     # packed left shifting >> 1
-        vpsrld  $2,\GH, \T3                     # packed left shifting >> 2
-        vpsrld  $7,\GH, \T4                     # packed left shifting >> 7
-        vpxor   \T3, \T2, \T2                   # xor the shifted versions
-        vpxor   \T4, \T2, \T2
-
-        vpxor   \T5, \T2, \T2
-        vpxor   \T2, \GH, \GH
-        vpxor   \T1, \GH, \GH                   # the result is in GH
-
-
-.endm
-
-.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
-
-        # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-        vmovdqa  \HK, \T5
-
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqu  \T1, HashKey_k(arg2)
-
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
-        vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqu  \T1, HashKey_2_k(arg2)
-
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
-        vmovdqu  \T5, HashKey_3(arg2)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqu  \T1, HashKey_3_k(arg2)
-
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
-        vmovdqu  \T5, HashKey_4(arg2)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqu  \T1, HashKey_4_k(arg2)
-
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
-        vmovdqu  \T5, HashKey_5(arg2)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqu  \T1, HashKey_5_k(arg2)
-
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
-        vmovdqu  \T5, HashKey_6(arg2)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqu  \T1, HashKey_6_k(arg2)
-
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
-        vmovdqu  \T5, HashKey_7(arg2)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqu  \T1, HashKey_7_k(arg2)
-
-        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
-        vmovdqu  \T5, HashKey_8(arg2)
-        vpshufd  $0b01001110, \T5, \T1
-        vpxor    \T5, \T1, \T1
-        vmovdqu  \T1, HashKey_8_k(arg2)
-
-.endm
-
-## if a = number of total plaintext bytes
-## b = floor(a/16)
-## num_initial_blocks = b mod 4#
-## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
-## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, arg4 are used as pointers only, not modified
-
-.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
-	i = (8-\num_initial_blocks)
-	setreg
-        vmovdqu AadHash(arg2), reg_i
-
-	# start AES for num_initial_blocks blocks
-	vmovdqu CurCount(arg2), \CTR
-
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
-                vmovdqa \CTR, reg_i
-                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
-	i = (i+1)
-	setreg
-.endr
-
-	vmovdqa  (arg1), \T_key
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-                vpxor   \T_key, reg_i, reg_i
-	i = (i+1)
-	setreg
-.endr
-
-       j = 1
-       setreg
-.rep \REP
-       vmovdqa  16*j(arg1), \T_key
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-        vaesenc \T_key, reg_i, reg_i
-	i = (i+1)
-	setreg
-.endr
-
-       j = (j+1)
-       setreg
-.endr
-
-	vmovdqa  16*j(arg1), \T_key
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-        vaesenclast      \T_key, reg_i, reg_i
-	i = (i+1)
-	setreg
-.endr
-
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-                vmovdqu (arg4, %r11), \T1
-                vpxor   \T1, reg_i, reg_i
-                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
-                add     $16, %r11
-.if  \ENC_DEC == DEC
-                vmovdqa \T1, reg_i
-.endif
-                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
-	i = (i+1)
-	setreg
-.endr
-
-
-	i = (8-\num_initial_blocks)
-	j = (9-\num_initial_blocks)
-	setreg
-
-.rep \num_initial_blocks
-        vpxor    reg_i, reg_j, reg_j
-        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
-	i = (i+1)
-	j = (j+1)
-	setreg
-.endr
-        # XMM8 has the combined result here
-
-        vmovdqa  \XMM8, TMP1(%rsp)
-        vmovdqa  \XMM8, \T3
-
-        cmp     $128, %r13
-        jl      .L_initial_blocks_done\@                  # no need for precomputed constants
-
-###############################################################################
-# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM1
-                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM2
-                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM3
-                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM4
-                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM5
-                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM6
-                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM7
-                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM8
-                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
-
-                vmovdqa  (arg1), \T_key
-                vpxor    \T_key, \XMM1, \XMM1
-                vpxor    \T_key, \XMM2, \XMM2
-                vpxor    \T_key, \XMM3, \XMM3
-                vpxor    \T_key, \XMM4, \XMM4
-                vpxor    \T_key, \XMM5, \XMM5
-                vpxor    \T_key, \XMM6, \XMM6
-                vpxor    \T_key, \XMM7, \XMM7
-                vpxor    \T_key, \XMM8, \XMM8
-
-               i = 1
-               setreg
-.rep    \REP       # do REP rounds
-                vmovdqa  16*i(arg1), \T_key
-                vaesenc  \T_key, \XMM1, \XMM1
-                vaesenc  \T_key, \XMM2, \XMM2
-                vaesenc  \T_key, \XMM3, \XMM3
-                vaesenc  \T_key, \XMM4, \XMM4
-                vaesenc  \T_key, \XMM5, \XMM5
-                vaesenc  \T_key, \XMM6, \XMM6
-                vaesenc  \T_key, \XMM7, \XMM7
-                vaesenc  \T_key, \XMM8, \XMM8
-               i = (i+1)
-               setreg
-.endr
-
-                vmovdqa  16*i(arg1), \T_key
-                vaesenclast  \T_key, \XMM1, \XMM1
-                vaesenclast  \T_key, \XMM2, \XMM2
-                vaesenclast  \T_key, \XMM3, \XMM3
-                vaesenclast  \T_key, \XMM4, \XMM4
-                vaesenclast  \T_key, \XMM5, \XMM5
-                vaesenclast  \T_key, \XMM6, \XMM6
-                vaesenclast  \T_key, \XMM7, \XMM7
-                vaesenclast  \T_key, \XMM8, \XMM8
-
-                vmovdqu  (arg4, %r11), \T1
-                vpxor    \T1, \XMM1, \XMM1
-                vmovdqu  \XMM1, (arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM1
-                .endif
-
-                vmovdqu  16*1(arg4, %r11), \T1
-                vpxor    \T1, \XMM2, \XMM2
-                vmovdqu  \XMM2, 16*1(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM2
-                .endif
-
-                vmovdqu  16*2(arg4, %r11), \T1
-                vpxor    \T1, \XMM3, \XMM3
-                vmovdqu  \XMM3, 16*2(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM3
-                .endif
-
-                vmovdqu  16*3(arg4, %r11), \T1
-                vpxor    \T1, \XMM4, \XMM4
-                vmovdqu  \XMM4, 16*3(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM4
-                .endif
-
-                vmovdqu  16*4(arg4, %r11), \T1
-                vpxor    \T1, \XMM5, \XMM5
-                vmovdqu  \XMM5, 16*4(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM5
-                .endif
-
-                vmovdqu  16*5(arg4, %r11), \T1
-                vpxor    \T1, \XMM6, \XMM6
-                vmovdqu  \XMM6, 16*5(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM6
-                .endif
-
-                vmovdqu  16*6(arg4, %r11), \T1
-                vpxor    \T1, \XMM7, \XMM7
-                vmovdqu  \XMM7, 16*6(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM7
-                .endif
-
-                vmovdqu  16*7(arg4, %r11), \T1
-                vpxor    \T1, \XMM8, \XMM8
-                vmovdqu  \XMM8, 16*7(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM8
-                .endif
-
-                add     $128, %r11
-
-                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
-                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
-                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
-
-###############################################################################
-
-.L_initial_blocks_done\@:
-
-.endm
-
-# encrypt 8 blocks at a time
-# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3, arg4 are used as pointers only, not modified
-# r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
-
-        vmovdqa \XMM1, \T2
-        vmovdqa \XMM2, TMP2(%rsp)
-        vmovdqa \XMM3, TMP3(%rsp)
-        vmovdqa \XMM4, TMP4(%rsp)
-        vmovdqa \XMM5, TMP5(%rsp)
-        vmovdqa \XMM6, TMP6(%rsp)
-        vmovdqa \XMM7, TMP7(%rsp)
-        vmovdqa \XMM8, TMP8(%rsp)
-
-.if \loop_idx == in_order
-                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
-                vpaddd  ONE(%rip), \XMM1, \XMM2
-                vpaddd  ONE(%rip), \XMM2, \XMM3
-                vpaddd  ONE(%rip), \XMM3, \XMM4
-                vpaddd  ONE(%rip), \XMM4, \XMM5
-                vpaddd  ONE(%rip), \XMM5, \XMM6
-                vpaddd  ONE(%rip), \XMM6, \XMM7
-                vpaddd  ONE(%rip), \XMM7, \XMM8
-                vmovdqa \XMM8, \CTR
-
-                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
-.else
-                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
-                vpaddd  ONEf(%rip), \XMM1, \XMM2
-                vpaddd  ONEf(%rip), \XMM2, \XMM3
-                vpaddd  ONEf(%rip), \XMM3, \XMM4
-                vpaddd  ONEf(%rip), \XMM4, \XMM5
-                vpaddd  ONEf(%rip), \XMM5, \XMM6
-                vpaddd  ONEf(%rip), \XMM6, \XMM7
-                vpaddd  ONEf(%rip), \XMM7, \XMM8
-                vmovdqa \XMM8, \CTR
-.endif
-
-
-        #######################################################################
-
-                vmovdqu (arg1), \T1
-                vpxor   \T1, \XMM1, \XMM1
-                vpxor   \T1, \XMM2, \XMM2
-                vpxor   \T1, \XMM3, \XMM3
-                vpxor   \T1, \XMM4, \XMM4
-                vpxor   \T1, \XMM5, \XMM5
-                vpxor   \T1, \XMM6, \XMM6
-                vpxor   \T1, \XMM7, \XMM7
-                vpxor   \T1, \XMM8, \XMM8
-
-        #######################################################################
-
-
-
-
-
-                vmovdqu 16*1(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-                vmovdqu 16*2(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-
-        #######################################################################
-
-        vmovdqu         HashKey_8(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
-        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0
-
-        vpshufd         $0b01001110, \T2, \T6
-        vpxor           \T2, \T6, \T6
-
-        vmovdqu         HashKey_8_k(arg2), \T5
-        vpclmulqdq      $0x00, \T5, \T6, \T6
-
-                vmovdqu 16*3(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        vmovdqa         TMP2(%rsp), \T1
-        vmovdqu         HashKey_7(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqu         HashKey_7_k(arg2), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*4(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        #######################################################################
-
-        vmovdqa         TMP3(%rsp), \T1
-        vmovdqu         HashKey_6(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqu         HashKey_6_k(arg2), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*5(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        vmovdqa         TMP4(%rsp), \T1
-        vmovdqu         HashKey_5(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqu         HashKey_5_k(arg2), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*6(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-
-        vmovdqa         TMP5(%rsp), \T1
-        vmovdqu         HashKey_4(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqu         HashKey_4_k(arg2), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*7(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        vmovdqa         TMP6(%rsp), \T1
-        vmovdqu         HashKey_3(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqu         HashKey_3_k(arg2), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
-
-
-                vmovdqu 16*8(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        vmovdqa         TMP7(%rsp), \T1
-        vmovdqu         HashKey_2(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqu         HashKey_2_k(arg2), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
-
-        #######################################################################
-
-                vmovdqu 16*9(arg1), \T5
-                vaesenc \T5, \XMM1, \XMM1
-                vaesenc \T5, \XMM2, \XMM2
-                vaesenc \T5, \XMM3, \XMM3
-                vaesenc \T5, \XMM4, \XMM4
-                vaesenc \T5, \XMM5, \XMM5
-                vaesenc \T5, \XMM6, \XMM6
-                vaesenc \T5, \XMM7, \XMM7
-                vaesenc \T5, \XMM8, \XMM8
-
-        vmovdqa         TMP8(%rsp), \T1
-        vmovdqu         HashKey(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpshufd         $0b01001110, \T1, \T3
-        vpxor           \T1, \T3, \T3
-        vmovdqu         HashKey_k(arg2), \T5
-        vpclmulqdq      $0x10, \T5, \T3, \T3
-        vpxor           \T3, \T6, \T6
-
-        vpxor           \T4, \T6, \T6
-        vpxor           \T7, \T6, \T6
-
-                vmovdqu 16*10(arg1), \T5
-
-        i = 11
-        setreg
-.rep (\REP-9)
-
-        vaesenc \T5, \XMM1, \XMM1
-        vaesenc \T5, \XMM2, \XMM2
-        vaesenc \T5, \XMM3, \XMM3
-        vaesenc \T5, \XMM4, \XMM4
-        vaesenc \T5, \XMM5, \XMM5
-        vaesenc \T5, \XMM6, \XMM6
-        vaesenc \T5, \XMM7, \XMM7
-        vaesenc \T5, \XMM8, \XMM8
-
-        vmovdqu 16*i(arg1), \T5
-        i = i + 1
-        setreg
-.endr
-
-	i = 0
-	j = 1
-	setreg
-.rep 8
-		vpxor	16*i(arg4, %r11), \T5, \T2
-                .if \ENC_DEC == ENC
-                vaesenclast     \T2, reg_j, reg_j
-                .else
-                vaesenclast     \T2, reg_j, \T3
-                vmovdqu 16*i(arg4, %r11), reg_j
-                vmovdqu \T3, 16*i(arg3, %r11)
-                .endif
-	i = (i+1)
-	j = (j+1)
-	setreg
-.endr
-	#######################################################################
-
-
-	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
-	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
-	vpxor	\T3, \T7, \T7
-	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7
-
-
-
-	#######################################################################
-	#first phase of the reduction
-	#######################################################################
-        vpslld  $31, \T7, \T2                           # packed right shifting << 31
-        vpslld  $30, \T7, \T3                           # packed right shifting shift << 30
-        vpslld  $25, \T7, \T4                           # packed right shifting shift << 25
-
-        vpxor   \T3, \T2, \T2                           # xor the shifted versions
-        vpxor   \T4, \T2, \T2
-
-        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW
-
-        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
-        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
-	#######################################################################
-                .if \ENC_DEC == ENC
-		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
-                .endif
-
-	#######################################################################
-	#second phase of the reduction
-        vpsrld  $1, \T7, \T2                            # packed left shifting >> 1
-        vpsrld  $2, \T7, \T3                            # packed left shifting >> 2
-        vpsrld  $7, \T7, \T4                            # packed left shifting >> 7
-        vpxor   \T3, \T2, \T2                           # xor the shifted versions
-        vpxor   \T4, \T2, \T2
-
-        vpxor   \T1, \T2, \T2
-        vpxor   \T2, \T7, \T7
-        vpxor   \T7, \T6, \T6                           # the result is in T6
-	#######################################################################
-
-		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
-
-
-	vpxor	\T6, \XMM1, \XMM1
-
-
-
-.endm
-
-
-# GHASH the last 4 ciphertext blocks.
-.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
-
-        ## Karatsuba Method
-
-
-        vpshufd         $0b01001110, \XMM1, \T2
-        vpxor           \XMM1, \T2, \T2
-        vmovdqu         HashKey_8(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \XMM1, \T6
-        vpclmulqdq      $0x00, \T5, \XMM1, \T7
-
-        vmovdqu         HashKey_8_k(arg2), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \XMM1
-
-        ######################
-
-        vpshufd         $0b01001110, \XMM2, \T2
-        vpxor           \XMM2, \T2, \T2
-        vmovdqu         HashKey_7(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \XMM2, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM2, \T4
-        vpxor           \T4, \T7, \T7
-
-        vmovdqu         HashKey_7_k(arg2), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vpshufd         $0b01001110, \XMM3, \T2
-        vpxor           \XMM3, \T2, \T2
-        vmovdqu         HashKey_6(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \XMM3, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM3, \T4
-        vpxor           \T4, \T7, \T7
-
-        vmovdqu         HashKey_6_k(arg2), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vpshufd         $0b01001110, \XMM4, \T2
-        vpxor           \XMM4, \T2, \T2
-        vmovdqu         HashKey_5(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \XMM4, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM4, \T4
-        vpxor           \T4, \T7, \T7
-
-        vmovdqu         HashKey_5_k(arg2), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vpshufd         $0b01001110, \XMM5, \T2
-        vpxor           \XMM5, \T2, \T2
-        vmovdqu         HashKey_4(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \XMM5, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM5, \T4
-        vpxor           \T4, \T7, \T7
-
-        vmovdqu         HashKey_4_k(arg2), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vpshufd         $0b01001110, \XMM6, \T2
-        vpxor           \XMM6, \T2, \T2
-        vmovdqu         HashKey_3(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \XMM6, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM6, \T4
-        vpxor           \T4, \T7, \T7
-
-        vmovdqu         HashKey_3_k(arg2), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vpshufd         $0b01001110, \XMM7, \T2
-        vpxor           \XMM7, \T2, \T2
-        vmovdqu         HashKey_2(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \XMM7, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM7, \T4
-        vpxor           \T4, \T7, \T7
-
-        vmovdqu         HashKey_2_k(arg2), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vpshufd         $0b01001110, \XMM8, \T2
-        vpxor           \XMM8, \T2, \T2
-        vmovdqu         HashKey(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \XMM8, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM8, \T4
-        vpxor           \T4, \T7, \T7
-
-        vmovdqu         HashKey_k(arg2), \T3
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-        vpxor           \T6, \XMM1, \XMM1
-        vpxor           \T7, \XMM1, \T2
-
-
-
-
-        vpslldq $8, \T2, \T4
-        vpsrldq $8, \T2, \T2
-
-        vpxor   \T4, \T7, \T7
-        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
-				# the accumulated carry-less multiplications
-
-        #######################################################################
-        #first phase of the reduction
-        vpslld  $31, \T7, \T2   # packed right shifting << 31
-        vpslld  $30, \T7, \T3   # packed right shifting shift << 30
-        vpslld  $25, \T7, \T4   # packed right shifting shift << 25
-
-        vpxor   \T3, \T2, \T2   # xor the shifted versions
-        vpxor   \T4, \T2, \T2
-
-        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW
-
-        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
-        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
-        #######################################################################
-
-
-        #second phase of the reduction
-        vpsrld  $1, \T7, \T2    # packed left shifting >> 1
-        vpsrld  $2, \T7, \T3    # packed left shifting >> 2
-        vpsrld  $7, \T7, \T4    # packed left shifting >> 7
-        vpxor   \T3, \T2, \T2   # xor the shifted versions
-        vpxor   \T4, \T2, \T2
-
-        vpxor   \T1, \T2, \T2
-        vpxor   \T2, \T7, \T7
-        vpxor   \T7, \T6, \T6   # the result is in T6
-
-.endm
-
-#############################################################
-#void   aesni_gcm_precomp_avx_gen2
-#        (gcm_data     *my_ctx_data,
-#         gcm_context_data *data,
-#        u8     *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
-#        u8      *iv, /* Pre-counter block j0: 4 byte salt
-#			(from Security Association) concatenated with 8 byte
-#			Initialisation Vector (from IPSec ESP Payload)
-#			concatenated with 0x00000001. 16-byte aligned pointer. */
-#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
-#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-#############################################################
-SYM_FUNC_START(aesni_gcm_init_avx_gen2)
-        FUNC_SAVE
-        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
-        FUNC_RESTORE
-        RET
-SYM_FUNC_END(aesni_gcm_init_avx_gen2)
-
-###############################################################################
-#void   aesni_gcm_enc_update_avx_gen2(
-#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
-#        gcm_context_data *data,
-#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
-#        const   u8 *in, /* Plaintext input */
-#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
-        FUNC_SAVE
-        mov     keysize, %eax
-        cmp     $32, %eax
-        je      key_256_enc_update
-        cmp     $16, %eax
-        je      key_128_enc_update
-        # must be 192
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
-        FUNC_RESTORE
-        RET
-key_128_enc_update:
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
-        FUNC_RESTORE
-        RET
-key_256_enc_update:
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
-        FUNC_RESTORE
-        RET
-SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
-
-###############################################################################
-#void   aesni_gcm_dec_update_avx_gen2(
-#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
-#        gcm_context_data *data,
-#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
-#        const   u8 *in, /* Ciphertext input */
-#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
-        FUNC_SAVE
-        mov     keysize,%eax
-        cmp     $32, %eax
-        je      key_256_dec_update
-        cmp     $16, %eax
-        je      key_128_dec_update
-        # must be 192
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
-        FUNC_RESTORE
-        RET
-key_128_dec_update:
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
-        FUNC_RESTORE
-        RET
-key_256_dec_update:
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
-        FUNC_RESTORE
-        RET
-SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
-
-###############################################################################
-#void   aesni_gcm_finalize_avx_gen2(
-#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
-#        gcm_context_data *data,
-#        u8      *auth_tag, /* Authenticated Tag output. */
-#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
-#				Valid values are 16 (most likely), 12 or 8. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
-        FUNC_SAVE
-        mov	keysize,%eax
-        cmp     $32, %eax
-        je      key_256_finalize
-        cmp     $16, %eax
-        je      key_128_finalize
-        # must be 192
-        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
-        FUNC_RESTORE
-        RET
-key_128_finalize:
-        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
-        FUNC_RESTORE
-        RET
-key_256_finalize:
-        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
-        FUNC_RESTORE
-        RET
-SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
-
-###############################################################################
-# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
-# Input: A and B (128-bits each, bit-reflected)
-# Output: C = A*B*x mod poly, (i.e. >>1 )
-# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
-# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
-###############################################################################
-.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
-
-        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
-        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
-        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
-        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
-        vpxor           \T3, \GH, \GH
-
-
-        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
-        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs
-
-        vpxor           \T3, \T1, \T1
-        vpxor           \T2, \GH, \GH
-
-        #######################################################################
-        #first phase of the reduction
-        vmovdqa         POLY2(%rip), \T3
-
-        vpclmulqdq      $0x01, \GH, \T3, \T2
-        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs
-
-        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
-        #######################################################################
-        #second phase of the reduction
-        vpclmulqdq      $0x00, \GH, \T3, \T2
-        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
-
-        vpclmulqdq      $0x10, \GH, \T3, \GH
-        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
-
-        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
-        #######################################################################
-        vpxor           \T1, \GH, \GH          # the result is in GH
-
-
-.endm
-
-.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
-
-        # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-        vmovdqa  \HK, \T5
-        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
-        vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly
-
-        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
-        vmovdqu  \T5, HashKey_3(arg2)
-
-        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
-        vmovdqu  \T5, HashKey_4(arg2)
-
-        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
-        vmovdqu  \T5, HashKey_5(arg2)
-
-        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
-        vmovdqu  \T5, HashKey_6(arg2)
-
-        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
-        vmovdqu  \T5, HashKey_7(arg2)
-
-        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
-        vmovdqu  \T5, HashKey_8(arg2)
-
-.endm
-
-## if a = number of total plaintext bytes
-## b = floor(a/16)
-## num_initial_blocks = b mod 4#
-## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
-## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, arg4 are used as pointers only, not modified
-
-.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
-	i = (8-\num_initial_blocks)
-	setreg
-	vmovdqu AadHash(arg2), reg_i
-
-	# start AES for num_initial_blocks blocks
-	vmovdqu CurCount(arg2), \CTR
-
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
-                vmovdqa \CTR, reg_i
-                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
-	i = (i+1)
-	setreg
-.endr
-
-	vmovdqa  (arg1), \T_key
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-                vpxor   \T_key, reg_i, reg_i
-	i = (i+1)
-	setreg
-.endr
-
-	j = 1
-	setreg
-.rep \REP
-	vmovdqa  16*j(arg1), \T_key
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-        vaesenc \T_key, reg_i, reg_i
-	i = (i+1)
-	setreg
-.endr
-
-	j = (j+1)
-	setreg
-.endr
-
-
-	vmovdqa  16*j(arg1), \T_key
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-        vaesenclast      \T_key, reg_i, reg_i
-	i = (i+1)
-	setreg
-.endr
-
-	i = (9-\num_initial_blocks)
-	setreg
-.rep \num_initial_blocks
-                vmovdqu (arg4, %r11), \T1
-                vpxor   \T1, reg_i, reg_i
-                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
-						       # num_initial_blocks blocks
-                add     $16, %r11
-.if  \ENC_DEC == DEC
-                vmovdqa \T1, reg_i
-.endif
-                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
-	i = (i+1)
-	setreg
-.endr
-
-
-	i = (8-\num_initial_blocks)
-	j = (9-\num_initial_blocks)
-	setreg
-
-.rep \num_initial_blocks
-        vpxor    reg_i, reg_j, reg_j
-        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
-	i = (i+1)
-	j = (j+1)
-	setreg
-.endr
-        # XMM8 has the combined result here
-
-        vmovdqa  \XMM8, TMP1(%rsp)
-        vmovdqa  \XMM8, \T3
-
-        cmp     $128, %r13
-        jl      .L_initial_blocks_done\@                  # no need for precomputed constants
-
-###############################################################################
-# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM1
-                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM2
-                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM3
-                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM4
-                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM5
-                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM6
-                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM7
-                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap
-
-                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
-                vmovdqa  \CTR, \XMM8
-                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap
-
-                vmovdqa  (arg1), \T_key
-                vpxor    \T_key, \XMM1, \XMM1
-                vpxor    \T_key, \XMM2, \XMM2
-                vpxor    \T_key, \XMM3, \XMM3
-                vpxor    \T_key, \XMM4, \XMM4
-                vpxor    \T_key, \XMM5, \XMM5
-                vpxor    \T_key, \XMM6, \XMM6
-                vpxor    \T_key, \XMM7, \XMM7
-                vpxor    \T_key, \XMM8, \XMM8
-
-		i = 1
-		setreg
-.rep    \REP       # do REP rounds
-                vmovdqa  16*i(arg1), \T_key
-                vaesenc  \T_key, \XMM1, \XMM1
-                vaesenc  \T_key, \XMM2, \XMM2
-                vaesenc  \T_key, \XMM3, \XMM3
-                vaesenc  \T_key, \XMM4, \XMM4
-                vaesenc  \T_key, \XMM5, \XMM5
-                vaesenc  \T_key, \XMM6, \XMM6
-                vaesenc  \T_key, \XMM7, \XMM7
-                vaesenc  \T_key, \XMM8, \XMM8
-		i = (i+1)
-		setreg
-.endr
-
-
-                vmovdqa  16*i(arg1), \T_key
-                vaesenclast  \T_key, \XMM1, \XMM1
-                vaesenclast  \T_key, \XMM2, \XMM2
-                vaesenclast  \T_key, \XMM3, \XMM3
-                vaesenclast  \T_key, \XMM4, \XMM4
-                vaesenclast  \T_key, \XMM5, \XMM5
-                vaesenclast  \T_key, \XMM6, \XMM6
-                vaesenclast  \T_key, \XMM7, \XMM7
-                vaesenclast  \T_key, \XMM8, \XMM8
-
-                vmovdqu  (arg4, %r11), \T1
-                vpxor    \T1, \XMM1, \XMM1
-                vmovdqu  \XMM1, (arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM1
-                .endif
-
-                vmovdqu  16*1(arg4, %r11), \T1
-                vpxor    \T1, \XMM2, \XMM2
-                vmovdqu  \XMM2, 16*1(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM2
-                .endif
-
-                vmovdqu  16*2(arg4, %r11), \T1
-                vpxor    \T1, \XMM3, \XMM3
-                vmovdqu  \XMM3, 16*2(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM3
-                .endif
-
-                vmovdqu  16*3(arg4, %r11), \T1
-                vpxor    \T1, \XMM4, \XMM4
-                vmovdqu  \XMM4, 16*3(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM4
-                .endif
-
-                vmovdqu  16*4(arg4, %r11), \T1
-                vpxor    \T1, \XMM5, \XMM5
-                vmovdqu  \XMM5, 16*4(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM5
-                .endif
-
-                vmovdqu  16*5(arg4, %r11), \T1
-                vpxor    \T1, \XMM6, \XMM6
-                vmovdqu  \XMM6, 16*5(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM6
-                .endif
-
-                vmovdqu  16*6(arg4, %r11), \T1
-                vpxor    \T1, \XMM7, \XMM7
-                vmovdqu  \XMM7, 16*6(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM7
-                .endif
-
-                vmovdqu  16*7(arg4, %r11), \T1
-                vpxor    \T1, \XMM8, \XMM8
-                vmovdqu  \XMM8, 16*7(arg3 , %r11)
-                .if   \ENC_DEC == DEC
-                vmovdqa  \T1, \XMM8
-                .endif
-
-                add     $128, %r11
-
-                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
-                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
-							   # the corresponding ciphertext
-                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
-                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
-
-###############################################################################
-
-.L_initial_blocks_done\@:
-
-
-.endm
-
-
-
-# encrypt 8 blocks at a time
-# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3, arg4 are used as pointers only, not modified
-# r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
-
-        vmovdqa \XMM1, \T2
-        vmovdqa \XMM2, TMP2(%rsp)
-        vmovdqa \XMM3, TMP3(%rsp)
-        vmovdqa \XMM4, TMP4(%rsp)
-        vmovdqa \XMM5, TMP5(%rsp)
-        vmovdqa \XMM6, TMP6(%rsp)
-        vmovdqa \XMM7, TMP7(%rsp)
-        vmovdqa \XMM8, TMP8(%rsp)
-
-.if \loop_idx == in_order
-                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
-                vpaddd  ONE(%rip), \XMM1, \XMM2
-                vpaddd  ONE(%rip), \XMM2, \XMM3
-                vpaddd  ONE(%rip), \XMM3, \XMM4
-                vpaddd  ONE(%rip), \XMM4, \XMM5
-                vpaddd  ONE(%rip), \XMM5, \XMM6
-                vpaddd  ONE(%rip), \XMM6, \XMM7
-                vpaddd  ONE(%rip), \XMM7, \XMM8
-                vmovdqa \XMM8, \CTR
-
-                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
-                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
-.else
-                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
-                vpaddd  ONEf(%rip), \XMM1, \XMM2
-                vpaddd  ONEf(%rip), \XMM2, \XMM3
-                vpaddd  ONEf(%rip), \XMM3, \XMM4
-                vpaddd  ONEf(%rip), \XMM4, \XMM5
-                vpaddd  ONEf(%rip), \XMM5, \XMM6
-                vpaddd  ONEf(%rip), \XMM6, \XMM7
-                vpaddd  ONEf(%rip), \XMM7, \XMM8
-                vmovdqa \XMM8, \CTR
-.endif
-
-
-        #######################################################################
-
-                vmovdqu (arg1), \T1
-                vpxor   \T1, \XMM1, \XMM1
-                vpxor   \T1, \XMM2, \XMM2
-                vpxor   \T1, \XMM3, \XMM3
-                vpxor   \T1, \XMM4, \XMM4
-                vpxor   \T1, \XMM5, \XMM5
-                vpxor   \T1, \XMM6, \XMM6
-                vpxor   \T1, \XMM7, \XMM7
-                vpxor   \T1, \XMM8, \XMM8
-
-        #######################################################################
-
-
-
-
-
-                vmovdqu 16*1(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-                vmovdqu 16*2(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-
-        #######################################################################
-
-        vmovdqu         HashKey_8(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
-        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
-        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
-        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
-        vpxor           \T5, \T6, \T6
-
-                vmovdqu 16*3(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        vmovdqa         TMP2(%rsp), \T1
-        vmovdqu         HashKey_7(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpclmulqdq      $0x01, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-        vpclmulqdq      $0x10, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*4(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        #######################################################################
-
-        vmovdqa         TMP3(%rsp), \T1
-        vmovdqu         HashKey_6(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpclmulqdq      $0x01, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-        vpclmulqdq      $0x10, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*5(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        vmovdqa         TMP4(%rsp), \T1
-        vmovdqu         HashKey_5(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpclmulqdq      $0x01, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-        vpclmulqdq      $0x10, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*6(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-
-        vmovdqa         TMP5(%rsp), \T1
-        vmovdqu         HashKey_4(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpclmulqdq      $0x01, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-        vpclmulqdq      $0x10, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*7(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        vmovdqa         TMP6(%rsp), \T1
-        vmovdqu         HashKey_3(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpclmulqdq      $0x01, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-        vpclmulqdq      $0x10, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-                vmovdqu 16*8(arg1), \T1
-                vaesenc \T1, \XMM1, \XMM1
-                vaesenc \T1, \XMM2, \XMM2
-                vaesenc \T1, \XMM3, \XMM3
-                vaesenc \T1, \XMM4, \XMM4
-                vaesenc \T1, \XMM5, \XMM5
-                vaesenc \T1, \XMM6, \XMM6
-                vaesenc \T1, \XMM7, \XMM7
-                vaesenc \T1, \XMM8, \XMM8
-
-        vmovdqa         TMP7(%rsp), \T1
-        vmovdqu         HashKey_2(arg2), \T5
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T4
-
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpclmulqdq      $0x01, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-        vpclmulqdq      $0x10, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-
-        #######################################################################
-
-                vmovdqu 16*9(arg1), \T5
-                vaesenc \T5, \XMM1, \XMM1
-                vaesenc \T5, \XMM2, \XMM2
-                vaesenc \T5, \XMM3, \XMM3
-                vaesenc \T5, \XMM4, \XMM4
-                vaesenc \T5, \XMM5, \XMM5
-                vaesenc \T5, \XMM6, \XMM6
-                vaesenc \T5, \XMM7, \XMM7
-                vaesenc \T5, \XMM8, \XMM8
-
-        vmovdqa         TMP8(%rsp), \T1
-        vmovdqu         HashKey(arg2), \T5
-
-        vpclmulqdq      $0x00, \T5, \T1, \T3
-        vpxor           \T3, \T7, \T7
-
-        vpclmulqdq      $0x01, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-        vpclmulqdq      $0x10, \T5, \T1, \T3
-        vpxor           \T3, \T6, \T6
-
-        vpclmulqdq      $0x11, \T5, \T1, \T3
-        vpxor           \T3, \T4, \T1
-
-
-                vmovdqu 16*10(arg1), \T5
-
-        i = 11
-        setreg
-.rep (\REP-9)
-        vaesenc \T5, \XMM1, \XMM1
-        vaesenc \T5, \XMM2, \XMM2
-        vaesenc \T5, \XMM3, \XMM3
-        vaesenc \T5, \XMM4, \XMM4
-        vaesenc \T5, \XMM5, \XMM5
-        vaesenc \T5, \XMM6, \XMM6
-        vaesenc \T5, \XMM7, \XMM7
-        vaesenc \T5, \XMM8, \XMM8
-
-        vmovdqu 16*i(arg1), \T5
-        i = i + 1
-        setreg
-.endr
-
-	i = 0
-	j = 1
-	setreg
-.rep 8
-		vpxor	16*i(arg4, %r11), \T5, \T2
-                .if \ENC_DEC == ENC
-                vaesenclast     \T2, reg_j, reg_j
-                .else
-                vaesenclast     \T2, reg_j, \T3
-                vmovdqu 16*i(arg4, %r11), reg_j
-                vmovdqu \T3, 16*i(arg3, %r11)
-                .endif
-	i = (i+1)
-	j = (j+1)
-	setreg
-.endr
-	#######################################################################
-
-
-	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
-	vpsrldq	$8, \T6, \T6				# shift-R T2 2 DWs
-	vpxor	\T3, \T7, \T7
-	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7
-
-
-
-	#######################################################################
-	#first phase of the reduction
-	vmovdqa         POLY2(%rip), \T3
-
-	vpclmulqdq	$0x01, \T7, \T3, \T2
-	vpslldq		$8, \T2, \T2			# shift-L xmm2 2 DWs
-
-	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
-	#######################################################################
-                .if \ENC_DEC == ENC
-		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
-		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
-                .endif
-
-	#######################################################################
-	#second phase of the reduction
-	vpclmulqdq	$0x00, \T7, \T3, \T2
-	vpsrldq		$4, \T2, \T2			# shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
-
-	vpclmulqdq	$0x10, \T7, \T3, \T4
-	vpslldq		$4, \T4, \T4			# shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
-
-	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
-	#######################################################################
-	vpxor		\T4, \T1, \T1			# the result is in T1
-
-		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
-		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap
-
-
-	vpxor	\T1, \XMM1, \XMM1
-
-
-
-.endm
-
-
-# GHASH the last 4 ciphertext blocks.
-.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
-
-        ## Karatsuba Method
-
-        vmovdqu         HashKey_8(arg2), \T5
-
-        vpshufd         $0b01001110, \XMM1, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM1, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM1, \T6
-        vpclmulqdq      $0x00, \T5, \XMM1, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \XMM1
-
-        ######################
-
-        vmovdqu         HashKey_7(arg2), \T5
-        vpshufd         $0b01001110, \XMM2, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM2, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM2, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM2, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqu         HashKey_6(arg2), \T5
-        vpshufd         $0b01001110, \XMM3, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM3, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM3, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM3, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqu         HashKey_5(arg2), \T5
-        vpshufd         $0b01001110, \XMM4, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM4, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM4, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM4, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqu         HashKey_4(arg2), \T5
-        vpshufd         $0b01001110, \XMM5, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM5, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM5, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM5, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqu         HashKey_3(arg2), \T5
-        vpshufd         $0b01001110, \XMM6, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM6, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM6, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM6, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqu         HashKey_2(arg2), \T5
-        vpshufd         $0b01001110, \XMM7, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM7, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM7, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM7, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-
-        ######################
-
-        vmovdqu         HashKey(arg2), \T5
-        vpshufd         $0b01001110, \XMM8, \T2
-        vpshufd         $0b01001110, \T5, \T3
-        vpxor           \XMM8, \T2, \T2
-        vpxor           \T5, \T3, \T3
-
-        vpclmulqdq      $0x11, \T5, \XMM8, \T4
-        vpxor           \T4, \T6, \T6
-
-        vpclmulqdq      $0x00, \T5, \XMM8, \T4
-        vpxor           \T4, \T7, \T7
-
-        vpclmulqdq      $0x00, \T3, \T2, \T2
-
-        vpxor           \T2, \XMM1, \XMM1
-        vpxor           \T6, \XMM1, \XMM1
-        vpxor           \T7, \XMM1, \T2
-
-
-
-
-        vpslldq $8, \T2, \T4
-        vpsrldq $8, \T2, \T2
-
-        vpxor   \T4, \T7, \T7
-        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
-						   # accumulated carry-less multiplications
-
-        #######################################################################
-        #first phase of the reduction
-        vmovdqa         POLY2(%rip), \T3
-
-        vpclmulqdq      $0x01, \T7, \T3, \T2
-        vpslldq         $8, \T2, \T2               # shift-L xmm2 2 DWs
-
-        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
-        #######################################################################
-
-
-        #second phase of the reduction
-        vpclmulqdq      $0x00, \T7, \T3, \T2
-        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
-
-        vpclmulqdq      $0x10, \T7, \T3, \T4
-        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
-
-        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
-        #######################################################################
-        vpxor           \T4, \T6, \T6              # the result is in T6
-.endm
-
-
-
-#############################################################
-#void   aesni_gcm_init_avx_gen4
-#        (gcm_data     *my_ctx_data,
-#         gcm_context_data *data,
-#        u8      *iv, /* Pre-counter block j0: 4 byte salt
-#			(from Security Association) concatenated with 8 byte
-#			Initialisation Vector (from IPSec ESP Payload)
-#			concatenated with 0x00000001. 16-byte aligned pointer. */
-#        u8     *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
-#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
-#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-#############################################################
-SYM_FUNC_START(aesni_gcm_init_avx_gen4)
-        FUNC_SAVE
-        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
-        FUNC_RESTORE
-        RET
-SYM_FUNC_END(aesni_gcm_init_avx_gen4)
-
-###############################################################################
-#void   aesni_gcm_enc_avx_gen4(
-#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
-#        gcm_context_data *data,
-#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
-#        const   u8 *in, /* Plaintext input */
-#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
-        FUNC_SAVE
-        mov     keysize,%eax
-        cmp     $32, %eax
-        je      key_256_enc_update4
-        cmp     $16, %eax
-        je      key_128_enc_update4
-        # must be 192
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
-        FUNC_RESTORE
-	RET
-key_128_enc_update4:
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
-        FUNC_RESTORE
-	RET
-key_256_enc_update4:
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
-        FUNC_RESTORE
-	RET
-SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
-
-###############################################################################
-#void   aesni_gcm_dec_update_avx_gen4(
-#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
-#        gcm_context_data *data,
-#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
-#        const   u8 *in, /* Ciphertext input */
-#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
-        FUNC_SAVE
-        mov     keysize,%eax
-        cmp     $32, %eax
-        je      key_256_dec_update4
-        cmp     $16, %eax
-        je      key_128_dec_update4
-        # must be 192
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
-        FUNC_RESTORE
-        RET
-key_128_dec_update4:
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
-        FUNC_RESTORE
-        RET
-key_256_dec_update4:
-        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
-        FUNC_RESTORE
-        RET
-SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
-
-###############################################################################
-#void   aesni_gcm_finalize_avx_gen4(
-#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
-#        gcm_context_data *data,
-#        u8      *auth_tag, /* Authenticated Tag output. */
-#        u64     auth_tag_len)# /* Authenticated Tag Length in bytes.
-#                              Valid values are 16 (most likely), 12 or 8. */
-###############################################################################
-SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
-        FUNC_SAVE
-        mov	keysize,%eax
-        cmp     $32, %eax
-        je      key_256_finalize4
-        cmp     $16, %eax
-        je      key_128_finalize4
-        # must be 192
-        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
-        FUNC_RESTORE
-        RET
-key_128_finalize4:
-        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
-        FUNC_RESTORE
-        RET
-key_256_finalize4:
-        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
-        FUNC_RESTORE
-        RET
-SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index ef031655b2d3..cd37de5ec404 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
- * Support for Intel AES-NI instructions. This file contains glue
- * code, the real AES implementation is in intel-aes_asm.S.
+ * Support for AES-NI and VAES instructions.  This file contains glue code.
+ * The real AES implementations are in aesni-intel_asm.S and other .S files.
  *
  * Copyright (C) 2008, Intel Corp.
  *    Author: Huang Ying <ying.huang@intel.com>
@@ -13,6 +13,8 @@
  *             Tadeusz Struk (tadeusz.struk@intel.com)
  *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  *    Copyright (c) 2010, Intel Corporation.
+ *
+ * Copyright 2024 Google LLC
  */
 
 #include <linux/hardirq.h>
@@ -44,41 +46,11 @@
 #define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
 #define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
 
-/* This data is stored at the end of the crypto_tfm struct.
- * It's a type of per "session" data storage location.
- * This needs to be 16 byte aligned.
- */
-struct aesni_rfc4106_gcm_ctx {
-	u8 hash_subkey[16] AESNI_ALIGN_ATTR;
-	struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
-	u8 nonce[4];
-};
-
-struct generic_gcmaes_ctx {
-	u8 hash_subkey[16] AESNI_ALIGN_ATTR;
-	struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
-};
-
 struct aesni_xts_ctx {
 	struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR;
 	struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR;
 };
 
-#define GCM_BLOCK_LEN 16
-
-struct gcm_context_data {
-	/* init, update and finalize context data */
-	u8 aad_hash[GCM_BLOCK_LEN];
-	u64 aad_length;
-	u64 in_length;
-	u8 partial_block_enc_key[GCM_BLOCK_LEN];
-	u8 orig_IV[GCM_BLOCK_LEN];
-	u8 current_counter[GCM_BLOCK_LEN];
-	u64 partial_block_len;
-	u64 unused;
-	u8 hash_keys[GCM_BLOCK_LEN * 16];
-};
-
 static inline void *aes_align_addr(void *addr)
 {
 	if (crypto_tfm_ctx_alignment() >= AESNI_ALIGN)
@@ -103,9 +75,6 @@ asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
 asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 				  const u8 *in, unsigned int len, u8 *iv);
 
-#define AVX_GEN2_OPTSIZE 640
-#define AVX_GEN4_OPTSIZE 4096
-
 asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 
@@ -118,23 +87,6 @@ asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc);
 
-/* Scatter / Gather routines, with args similar to above */
-asmlinkage void aesni_gcm_init(void *ctx,
-			       struct gcm_context_data *gdata,
-			       u8 *iv,
-			       u8 *hash_subkey, const u8 *aad,
-			       unsigned long aad_len);
-asmlinkage void aesni_gcm_enc_update(void *ctx,
-				     struct gcm_context_data *gdata, u8 *out,
-				     const u8 *in, unsigned long plaintext_len);
-asmlinkage void aesni_gcm_dec_update(void *ctx,
-				     struct gcm_context_data *gdata, u8 *out,
-				     const u8 *in,
-				     unsigned long ciphertext_len);
-asmlinkage void aesni_gcm_finalize(void *ctx,
-				   struct gcm_context_data *gdata,
-				   u8 *auth_tag, unsigned long auth_tag_len);
-
 asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
 		void *keys, u8 *out, unsigned int num_bytes);
 asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
@@ -154,67 +106,6 @@ asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv,
 asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv,
 	const void *keys, u8 *out, unsigned int num_bytes,
 	unsigned int byte_ctr);
-
-/*
- * asmlinkage void aesni_gcm_init_avx_gen2()
- * gcm_data *my_ctx_data, context data
- * u8 *hash_subkey,  the Hash sub key input. Data starts on a 16-byte boundary.
- */
-asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data,
-					struct gcm_context_data *gdata,
-					u8 *iv,
-					u8 *hash_subkey,
-					const u8 *aad,
-					unsigned long aad_len);
-
-asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx,
-				     struct gcm_context_data *gdata, u8 *out,
-				     const u8 *in, unsigned long plaintext_len);
-asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx,
-				     struct gcm_context_data *gdata, u8 *out,
-				     const u8 *in,
-				     unsigned long ciphertext_len);
-asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx,
-				   struct gcm_context_data *gdata,
-				   u8 *auth_tag, unsigned long auth_tag_len);
-
-/*
- * asmlinkage void aesni_gcm_init_avx_gen4()
- * gcm_data *my_ctx_data, context data
- * u8 *hash_subkey,  the Hash sub key input. Data starts on a 16-byte boundary.
- */
-asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data,
-					struct gcm_context_data *gdata,
-					u8 *iv,
-					u8 *hash_subkey,
-					const u8 *aad,
-					unsigned long aad_len);
-
-asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx,
-				     struct gcm_context_data *gdata, u8 *out,
-				     const u8 *in, unsigned long plaintext_len);
-asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx,
-				     struct gcm_context_data *gdata, u8 *out,
-				     const u8 *in,
-				     unsigned long ciphertext_len);
-asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx,
-				   struct gcm_context_data *gdata,
-				   u8 *auth_tag, unsigned long auth_tag_len);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx);
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2);
-
-static inline struct
-aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
-{
-	return aes_align_addr(crypto_aead_ctx(tfm));
-}
-
-static inline struct
-generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm)
-{
-	return aes_align_addr(crypto_aead_ctx(tfm));
-}
 #endif
 
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
@@ -588,280 +479,6 @@ static int xctr_crypt(struct skcipher_request *req)
 	}
 	return err;
 }
-
-static int aes_gcm_derive_hash_subkey(const struct crypto_aes_ctx *aes_key,
-				      u8 hash_subkey[AES_BLOCK_SIZE])
-{
-	static const u8 zeroes[AES_BLOCK_SIZE];
-
-	aes_encrypt(aes_key, hash_subkey, zeroes);
-	return 0;
-}
-
-static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
-				  unsigned int key_len)
-{
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead);
-
-	if (key_len < 4)
-		return -EINVAL;
-
-	/*Account for 4 byte nonce at the end.*/
-	key_len -= 4;
-
-	memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
-
-	return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?:
-	       aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded,
-					  ctx->hash_subkey);
-}
-
-/* This is the Integrity Check Value (aka the authentication tag) length and can
- * be 8, 12 or 16 bytes long. */
-static int common_rfc4106_set_authsize(struct crypto_aead *aead,
-				       unsigned int authsize)
-{
-	switch (authsize) {
-	case 8:
-	case 12:
-	case 16:
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
-				       unsigned int authsize)
-{
-	switch (authsize) {
-	case 4:
-	case 8:
-	case 12:
-	case 13:
-	case 14:
-	case 15:
-	case 16:
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
-			      unsigned int assoclen, u8 *hash_subkey,
-			      u8 *iv, void *aes_ctx, u8 *auth_tag,
-			      unsigned long auth_tag_len)
-{
-	u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8);
-	struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN);
-	unsigned long left = req->cryptlen;
-	struct scatter_walk assoc_sg_walk;
-	struct skcipher_walk walk;
-	bool do_avx, do_avx2;
-	u8 *assocmem = NULL;
-	u8 *assoc;
-	int err;
-
-	if (!enc)
-		left -= auth_tag_len;
-
-	do_avx = (left >= AVX_GEN2_OPTSIZE);
-	do_avx2 = (left >= AVX_GEN4_OPTSIZE);
-
-	/* Linearize assoc, if not already linear */
-	if (req->src->length >= assoclen && req->src->length) {
-		scatterwalk_start(&assoc_sg_walk, req->src);
-		assoc = scatterwalk_map(&assoc_sg_walk);
-	} else {
-		gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
-			      GFP_KERNEL : GFP_ATOMIC;
-
-		/* assoc can be any length, so must be on heap */
-		assocmem = kmalloc(assoclen, flags);
-		if (unlikely(!assocmem))
-			return -ENOMEM;
-		assoc = assocmem;
-
-		scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
-	}
-
-	kernel_fpu_begin();
-	if (static_branch_likely(&gcm_use_avx2) && do_avx2)
-		aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc,
-					assoclen);
-	else if (static_branch_likely(&gcm_use_avx) && do_avx)
-		aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc,
-					assoclen);
-	else
-		aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen);
-	kernel_fpu_end();
-
-	if (!assocmem)
-		scatterwalk_unmap(assoc);
-	else
-		kfree(assocmem);
-
-	err = enc ? skcipher_walk_aead_encrypt(&walk, req, false)
-		  : skcipher_walk_aead_decrypt(&walk, req, false);
-
-	while (walk.nbytes > 0) {
-		kernel_fpu_begin();
-		if (static_branch_likely(&gcm_use_avx2) && do_avx2) {
-			if (enc)
-				aesni_gcm_enc_update_avx_gen4(aes_ctx, data,
-							      walk.dst.virt.addr,
-							      walk.src.virt.addr,
-							      walk.nbytes);
-			else
-				aesni_gcm_dec_update_avx_gen4(aes_ctx, data,
-							      walk.dst.virt.addr,
-							      walk.src.virt.addr,
-							      walk.nbytes);
-		} else if (static_branch_likely(&gcm_use_avx) && do_avx) {
-			if (enc)
-				aesni_gcm_enc_update_avx_gen2(aes_ctx, data,
-							      walk.dst.virt.addr,
-							      walk.src.virt.addr,
-							      walk.nbytes);
-			else
-				aesni_gcm_dec_update_avx_gen2(aes_ctx, data,
-							      walk.dst.virt.addr,
-							      walk.src.virt.addr,
-							      walk.nbytes);
-		} else if (enc) {
-			aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr,
-					     walk.src.virt.addr, walk.nbytes);
-		} else {
-			aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr,
-					     walk.src.virt.addr, walk.nbytes);
-		}
-		kernel_fpu_end();
-
-		err = skcipher_walk_done(&walk, 0);
-	}
-
-	if (err)
-		return err;
-
-	kernel_fpu_begin();
-	if (static_branch_likely(&gcm_use_avx2) && do_avx2)
-		aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag,
-					    auth_tag_len);
-	else if (static_branch_likely(&gcm_use_avx) && do_avx)
-		aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag,
-					    auth_tag_len);
-	else
-		aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len);
-	kernel_fpu_end();
-
-	return 0;
-}
-
-static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
-			  u8 *hash_subkey, u8 *iv, void *aes_ctx)
-{
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
-	u8 auth_tag[16];
-	int err;
-
-	err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx,
-				 auth_tag, auth_tag_len);
-	if (err)
-		return err;
-
-	scatterwalk_map_and_copy(auth_tag, req->dst,
-				 req->assoclen + req->cryptlen,
-				 auth_tag_len, 1);
-	return 0;
-}
-
-static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
-			  u8 *hash_subkey, u8 *iv, void *aes_ctx)
-{
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
-	u8 auth_tag_msg[16];
-	u8 auth_tag[16];
-	int err;
-
-	err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx,
-				 auth_tag, auth_tag_len);
-	if (err)
-		return err;
-
-	/* Copy out original auth_tag */
-	scatterwalk_map_and_copy(auth_tag_msg, req->src,
-				 req->assoclen + req->cryptlen - auth_tag_len,
-				 auth_tag_len, 0);
-
-	/* Compare generated tag with passed in tag. */
-	if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) {
-		memzero_explicit(auth_tag, sizeof(auth_tag));
-		return -EBADMSG;
-	}
-	return 0;
-}
-
-static int helper_rfc4106_encrypt(struct aead_request *req)
-{
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
-	void *aes_ctx = &(ctx->aes_key_expanded);
-	u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
-	u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
-	unsigned int i;
-	__be32 counter = cpu_to_be32(1);
-
-	/* Assuming we are supporting rfc4106 64-bit extended */
-	/* sequence numbers We need to have the AAD length equal */
-	/* to 16 or 20 bytes */
-	if (unlikely(req->assoclen != 16 && req->assoclen != 20))
-		return -EINVAL;
-
-	/* IV below built */
-	for (i = 0; i < 4; i++)
-		*(iv+i) = ctx->nonce[i];
-	for (i = 0; i < 8; i++)
-		*(iv+4+i) = req->iv[i];
-	*((__be32 *)(iv+12)) = counter;
-
-	return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
-			      aes_ctx);
-}
-
-static int helper_rfc4106_decrypt(struct aead_request *req)
-{
-	__be32 counter = cpu_to_be32(1);
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
-	void *aes_ctx = &(ctx->aes_key_expanded);
-	u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
-	u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
-	unsigned int i;
-
-	if (unlikely(req->assoclen != 16 && req->assoclen != 20))
-		return -EINVAL;
-
-	/* Assuming we are supporting rfc4106 64-bit extended */
-	/* sequence numbers We need to have the AAD length */
-	/* equal to 16 or 20 bytes */
-
-	/* IV below built */
-	for (i = 0; i < 4; i++)
-		*(iv+i) = ctx->nonce[i];
-	for (i = 0; i < 8; i++)
-		*(iv+4+i) = req->iv[i];
-	*((__be32 *)(iv+12)) = counter;
-
-	return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv,
-			      aes_ctx);
-}
 #endif
 
 static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
@@ -1216,11 +833,717 @@ DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700);
 DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800);
 #endif
 
+/* The common part of the x86_64 AES-GCM key struct */
+struct aes_gcm_key {
+	/* Expanded AES key and the AES key length in bytes */
+	struct crypto_aes_ctx aes_key;
+
+	/* RFC4106 nonce (used only by the rfc4106 algorithms) */
+	u32 rfc4106_nonce;
+};
+
+/* Key struct used by the AES-NI implementations of AES-GCM */
+struct aes_gcm_key_aesni {
+	/*
+	 * Common part of the key.  The assembly code requires 16-byte alignment
+	 * for the round keys; we get this by them being located at the start of
+	 * the struct and the whole struct being 16-byte aligned.
+	 */
+	struct aes_gcm_key base;
+
+	/*
+	 * Powers of the hash key H^8 through H^1.  These are 128-bit values.
+	 * They all have an extra factor of x^-1 and are byte-reversed.  16-byte
+	 * alignment is required by the assembly code.
+	 */
+	u64 h_powers[8][2] __aligned(16);
+
+	/*
+	 * h_powers_xored[i] contains the two 64-bit halves of h_powers[i] XOR'd
+	 * together.  It's used for Karatsuba multiplication.  16-byte alignment
+	 * is required by the assembly code.
+	 */
+	u64 h_powers_xored[8] __aligned(16);
+
+	/*
+	 * H^1 times x^64 (and also the usual extra factor of x^-1).  16-byte
+	 * alignment is required by the assembly code.
+	 */
+	u64 h_times_x64[2] __aligned(16);
+};
+#define AES_GCM_KEY_AESNI(key)	\
+	container_of((key), struct aes_gcm_key_aesni, base)
+#define AES_GCM_KEY_AESNI_SIZE	\
+	(sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1)))
+
+/* Key struct used by the VAES + AVX10 implementations of AES-GCM */
+struct aes_gcm_key_avx10 {
+	/*
+	 * Common part of the key.  The assembly code prefers 16-byte alignment
+	 * for the round keys; we get this by them being located at the start of
+	 * the struct and the whole struct being 64-byte aligned.
+	 */
+	struct aes_gcm_key base;
+
+	/*
+	 * Powers of the hash key H^16 through H^1.  These are 128-bit values.
+	 * They all have an extra factor of x^-1 and are byte-reversed.  This
+	 * array is aligned to a 64-byte boundary to make it naturally aligned
+	 * for 512-bit loads, which can improve performance.  (The assembly code
+	 * doesn't *need* the alignment; this is just an optimization.)
+	 */
+	u64 h_powers[16][2] __aligned(64);
+
+	/* Three padding blocks required by the assembly code */
+	u64 padding[3][2];
+};
+#define AES_GCM_KEY_AVX10(key)	\
+	container_of((key), struct aes_gcm_key_avx10, base)
+#define AES_GCM_KEY_AVX10_SIZE	\
+	(sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1)))
+
+/*
+ * These flags are passed to the AES-GCM helper functions to specify the
+ * specific version of AES-GCM (RFC4106 or not), whether it's encryption or
+ * decryption, and which assembly functions should be called.  Assembly
+ * functions are selected using flags instead of function pointers to avoid
+ * indirect calls (which are very expensive on x86) regardless of inlining.
+ */
+#define FLAG_RFC4106	BIT(0)
+#define FLAG_ENC	BIT(1)
+#define FLAG_AVX	BIT(2)
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+#  define FLAG_AVX10_256	BIT(3)
+#  define FLAG_AVX10_512	BIT(4)
+#else
+   /*
+    * This should cause all calls to the AVX10 assembly functions to be
+    * optimized out, avoiding the need to ifdef each call individually.
+    */
+#  define FLAG_AVX10_256	0
+#  define FLAG_AVX10_512	0
+#endif
+
+static inline struct aes_gcm_key *
+aes_gcm_key_get(struct crypto_aead *tfm, int flags)
+{
+	if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+		return PTR_ALIGN(crypto_aead_ctx(tfm), 64);
+	else
+		return PTR_ALIGN(crypto_aead_ctx(tfm), 16);
+}
+
+asmlinkage void
+aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key);
+asmlinkage void
+aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key);
+asmlinkage void
+aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key);
+asmlinkage void
+aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key);
+
+static void aes_gcm_precompute(struct aes_gcm_key *key, int flags)
+{
+	/*
+	 * To make things a bit easier on the assembly side, the AVX10
+	 * implementations use the same key format.  Therefore, a single
+	 * function using 256-bit vectors would suffice here.  However, it's
+	 * straightforward to provide a 512-bit one because of how the assembly
+	 * code is structured, and it works nicely because the total size of the
+	 * key powers is a multiple of 512 bits.  So we take advantage of that.
+	 *
+	 * A similar situation applies to the AES-NI implementations.
+	 */
+	if (flags & FLAG_AVX10_512)
+		aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key));
+	else if (flags & FLAG_AVX10_256)
+		aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key));
+	else if (flags & FLAG_AVX)
+		aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key));
+	else
+		aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key));
+}
+
+asmlinkage void
+aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
+			 u8 ghash_acc[16], const u8 *aad, int aadlen);
+asmlinkage void
+aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key,
+			     u8 ghash_acc[16], const u8 *aad, int aadlen);
+asmlinkage void
+aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+			      u8 ghash_acc[16], const u8 *aad, int aadlen);
+
+static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16],
+			       const u8 *aad, int aadlen, int flags)
+{
+	if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+		aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc,
+					      aad, aadlen);
+	else if (flags & FLAG_AVX)
+		aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc,
+					     aad, aadlen);
+	else
+		aes_gcm_aad_update_aesni(AES_GCM_KEY_AESNI(key), ghash_acc,
+					 aad, aadlen);
+}
+
+asmlinkage void
+aes_gcm_enc_update_aesni(const struct aes_gcm_key_aesni *key,
+			 const u32 le_ctr[4], u8 ghash_acc[16],
+			 const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key,
+			     const u32 le_ctr[4], u8 ghash_acc[16],
+			     const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
+				  const u32 le_ctr[4], u8 ghash_acc[16],
+				  const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
+				  const u32 le_ctr[4], u8 ghash_acc[16],
+				  const u8 *src, u8 *dst, int datalen);
+
+asmlinkage void
+aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key,
+			 const u32 le_ctr[4], u8 ghash_acc[16],
+			 const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key,
+			     const u32 le_ctr[4], u8 ghash_acc[16],
+			     const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
+				  const u32 le_ctr[4], u8 ghash_acc[16],
+				  const u8 *src, u8 *dst, int datalen);
+asmlinkage void
+aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
+				  const u32 le_ctr[4], u8 ghash_acc[16],
+				  const u8 *src, u8 *dst, int datalen);
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline void
+aes_gcm_update(const struct aes_gcm_key *key,
+	       const u32 le_ctr[4], u8 ghash_acc[16],
+	       const u8 *src, u8 *dst, int datalen, int flags)
+{
+	if (flags & FLAG_ENC) {
+		if (flags & FLAG_AVX10_512)
+			aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key),
+							  le_ctr, ghash_acc,
+							  src, dst, datalen);
+		else if (flags & FLAG_AVX10_256)
+			aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key),
+							  le_ctr, ghash_acc,
+							  src, dst, datalen);
+		else if (flags & FLAG_AVX)
+			aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key),
+						     le_ctr, ghash_acc,
+						     src, dst, datalen);
+		else
+			aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr,
+						 ghash_acc, src, dst, datalen);
+	} else {
+		if (flags & FLAG_AVX10_512)
+			aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key),
+							  le_ctr, ghash_acc,
+							  src, dst, datalen);
+		else if (flags & FLAG_AVX10_256)
+			aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key),
+							  le_ctr, ghash_acc,
+							  src, dst, datalen);
+		else if (flags & FLAG_AVX)
+			aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key),
+						     le_ctr, ghash_acc,
+						     src, dst, datalen);
+		else
+			aes_gcm_dec_update_aesni(AES_GCM_KEY_AESNI(key),
+						 le_ctr, ghash_acc,
+						 src, dst, datalen);
+	}
+}
+
+asmlinkage void
+aes_gcm_enc_final_aesni(const struct aes_gcm_key_aesni *key,
+			const u32 le_ctr[4], u8 ghash_acc[16],
+			u64 total_aadlen, u64 total_datalen);
+asmlinkage void
+aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key,
+			    const u32 le_ctr[4], u8 ghash_acc[16],
+			    u64 total_aadlen, u64 total_datalen);
+asmlinkage void
+aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+			     const u32 le_ctr[4], u8 ghash_acc[16],
+			     u64 total_aadlen, u64 total_datalen);
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline void
+aes_gcm_enc_final(const struct aes_gcm_key *key,
+		  const u32 le_ctr[4], u8 ghash_acc[16],
+		  u64 total_aadlen, u64 total_datalen, int flags)
+{
+	if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+		aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key),
+					     le_ctr, ghash_acc,
+					     total_aadlen, total_datalen);
+	else if (flags & FLAG_AVX)
+		aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key),
+					    le_ctr, ghash_acc,
+					    total_aadlen, total_datalen);
+	else
+		aes_gcm_enc_final_aesni(AES_GCM_KEY_AESNI(key),
+					le_ctr, ghash_acc,
+					total_aadlen, total_datalen);
+}
+
+asmlinkage bool __must_check
+aes_gcm_dec_final_aesni(const struct aes_gcm_key_aesni *key,
+			const u32 le_ctr[4], const u8 ghash_acc[16],
+			u64 total_aadlen, u64 total_datalen,
+			const u8 tag[16], int taglen);
+asmlinkage bool __must_check
+aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key,
+			    const u32 le_ctr[4], const u8 ghash_acc[16],
+			    u64 total_aadlen, u64 total_datalen,
+			    const u8 tag[16], int taglen);
+asmlinkage bool __must_check
+aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
+			     const u32 le_ctr[4], const u8 ghash_acc[16],
+			     u64 total_aadlen, u64 total_datalen,
+			     const u8 tag[16], int taglen);
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline bool __must_check
+aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4],
+		  u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen,
+		  u8 tag[16], int taglen, int flags)
+{
+	if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
+		return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key),
+						    le_ctr, ghash_acc,
+						    total_aadlen, total_datalen,
+						    tag, taglen);
+	else if (flags & FLAG_AVX)
+		return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key),
+						   le_ctr, ghash_acc,
+						   total_aadlen, total_datalen,
+						   tag, taglen);
+	else
+		return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key),
+					       le_ctr, ghash_acc,
+					       total_aadlen, total_datalen,
+					       tag, taglen);
+}
+
+/*
+ * This is the Integrity Check Value (aka the authentication tag) length and can
+ * be 8, 12 or 16 bytes long.
+ */
+static int common_rfc4106_set_authsize(struct crypto_aead *aead,
+				       unsigned int authsize)
+{
+	switch (authsize) {
+	case 8:
+	case 12:
+	case 16:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
+				       unsigned int authsize)
+{
+	switch (authsize) {
+	case 4:
+	case 8:
+	case 12:
+	case 13:
+	case 14:
+	case 15:
+	case 16:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * This is the setkey function for the x86_64 implementations of AES-GCM.  It
+ * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes
+ * powers of the hash key.
+ *
+ * To comply with the crypto_aead API, this has to be usable in no-SIMD context.
+ * For that reason, this function includes a portable C implementation of the
+ * needed logic.  However, the portable C implementation is very slow, taking
+ * about the same time as encrypting 37 KB of data.  To be ready for users that
+ * may set a key even somewhat frequently, we therefore also include a SIMD
+ * assembly implementation, expanding the AES key using AES-NI and precomputing
+ * the hash key powers using PCLMULQDQ or VPCLMULQDQ.
+ */
+static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key,
+		      unsigned int keylen, int flags)
+{
+	struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags);
+	int err;
+
+	if (flags & FLAG_RFC4106) {
+		if (keylen < 4)
+			return -EINVAL;
+		keylen -= 4;
+		key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen);
+	}
+
+	/* The assembly code assumes the following offsets. */
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_enc) != 0);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_length) != 480);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512);
+	BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768);
+
+	if (likely(crypto_simd_usable())) {
+		err = aes_check_keylen(keylen);
+		if (err)
+			return err;
+		kernel_fpu_begin();
+		aesni_set_key(&key->aes_key, raw_key, keylen);
+		aes_gcm_precompute(key, flags);
+		kernel_fpu_end();
+	} else {
+		static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = {
+			[0] = 0xc2, [15] = 1
+		};
+		static const u8 x_to_the_63[16] __aligned(__alignof__(be128)) = {
+			[7] = 1,
+		};
+		be128 h1 = {};
+		be128 h;
+		int i;
+
+		err = aes_expandkey(&key->aes_key, raw_key, keylen);
+		if (err)
+			return err;
+
+		/* Encrypt the all-zeroes block to get the hash key H^1 */
+		aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1);
+
+		/* Compute H^1 * x^-1 */
+		h = h1;
+		gf128mul_lle(&h, (const be128 *)x_to_the_minus1);
+
+		/* Compute the needed key powers */
+		if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) {
+			struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key);
+
+			for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
+				k->h_powers[i][0] = be64_to_cpu(h.b);
+				k->h_powers[i][1] = be64_to_cpu(h.a);
+				gf128mul_lle(&h, &h1);
+			}
+			memset(k->padding, 0, sizeof(k->padding));
+		} else {
+			struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key);
+
+			for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) {
+				k->h_powers[i][0] = be64_to_cpu(h.b);
+				k->h_powers[i][1] = be64_to_cpu(h.a);
+				k->h_powers_xored[i] = k->h_powers[i][0] ^
+						       k->h_powers[i][1];
+				gf128mul_lle(&h, &h1);
+			}
+			gf128mul_lle(&h1, (const be128 *)x_to_the_63);
+			k->h_times_x64[0] = be64_to_cpu(h1.b);
+			k->h_times_x64[1] = be64_to_cpu(h1.a);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Initialize @ghash_acc, then pass all @assoclen bytes of associated data
+ * (a.k.a. additional authenticated data) from @sg_src through the GHASH update
+ * assembly function.  kernel_fpu_begin() must have already been called.
+ */
+static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16],
+			      struct scatterlist *sg_src, unsigned int assoclen,
+			      int flags)
+{
+	struct scatter_walk walk;
+	/*
+	 * The assembly function requires that the length of any non-last
+	 * segment of associated data be a multiple of 16 bytes, so this
+	 * function does the buffering needed to achieve that.
+	 */
+	unsigned int pos = 0;
+	u8 buf[16];
+
+	memset(ghash_acc, 0, 16);
+	scatterwalk_start(&walk, sg_src);
+
+	while (assoclen) {
+		unsigned int len_this_page = scatterwalk_clamp(&walk, assoclen);
+		void *mapped = scatterwalk_map(&walk);
+		const void *src = mapped;
+		unsigned int len;
+
+		assoclen -= len_this_page;
+		scatterwalk_advance(&walk, len_this_page);
+		if (unlikely(pos)) {
+			len = min(len_this_page, 16 - pos);
+			memcpy(&buf[pos], src, len);
+			pos += len;
+			src += len;
+			len_this_page -= len;
+			if (pos < 16)
+				goto next;
+			aes_gcm_aad_update(key, ghash_acc, buf, 16, flags);
+			pos = 0;
+		}
+		len = len_this_page;
+		if (unlikely(assoclen)) /* Not the last segment yet? */
+			len = round_down(len, 16);
+		aes_gcm_aad_update(key, ghash_acc, src, len, flags);
+		src += len;
+		len_this_page -= len;
+		if (unlikely(len_this_page)) {
+			memcpy(buf, src, len_this_page);
+			pos = len_this_page;
+		}
+next:
+		scatterwalk_unmap(mapped);
+		scatterwalk_pagedone(&walk, 0, assoclen);
+		if (need_resched()) {
+			kernel_fpu_end();
+			kernel_fpu_begin();
+		}
+	}
+	if (unlikely(pos))
+		aes_gcm_aad_update(key, ghash_acc, buf, pos, flags);
+}
+
+
+/* __always_inline to optimize out the branches based on @flags */
+static __always_inline int
+gcm_crypt(struct aead_request *req, int flags)
+{
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	const struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags);
+	unsigned int assoclen = req->assoclen;
+	struct skcipher_walk walk;
+	unsigned int nbytes;
+	u8 ghash_acc[16]; /* GHASH accumulator */
+	u32 le_ctr[4]; /* Counter in little-endian format */
+	int taglen;
+	int err;
+
+	/* Initialize the counter and determine the associated data length. */
+	le_ctr[0] = 2;
+	if (flags & FLAG_RFC4106) {
+		if (unlikely(assoclen != 16 && assoclen != 20))
+			return -EINVAL;
+		assoclen -= 8;
+		le_ctr[1] = get_unaligned_be32(req->iv + 4);
+		le_ctr[2] = get_unaligned_be32(req->iv + 0);
+		le_ctr[3] = key->rfc4106_nonce; /* already byte-swapped */
+	} else {
+		le_ctr[1] = get_unaligned_be32(req->iv + 8);
+		le_ctr[2] = get_unaligned_be32(req->iv + 4);
+		le_ctr[3] = get_unaligned_be32(req->iv + 0);
+	}
+
+	/* Begin walking through the plaintext or ciphertext. */
+	if (flags & FLAG_ENC)
+		err = skcipher_walk_aead_encrypt(&walk, req, false);
+	else
+		err = skcipher_walk_aead_decrypt(&walk, req, false);
+
+	/*
+	 * Since the AES-GCM assembly code requires that at least three assembly
+	 * functions be called to process any message (this is needed to support
+	 * incremental updates cleanly), to reduce overhead we try to do all
+	 * three calls in the same kernel FPU section if possible.  We close the
+	 * section and start a new one if there are multiple data segments or if
+	 * rescheduling is needed while processing the associated data.
+	 */
+	kernel_fpu_begin();
+
+	/* Pass the associated data through GHASH. */
+	gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags);
+
+	/* En/decrypt the data and pass the ciphertext through GHASH. */
+	while ((nbytes = walk.nbytes) != 0) {
+		if (unlikely(nbytes < walk.total)) {
+			/*
+			 * Non-last segment.  In this case, the assembly
+			 * function requires that the length be a multiple of 16
+			 * (AES_BLOCK_SIZE) bytes.  The needed buffering of up
+			 * to 16 bytes is handled by the skcipher_walk.  Here we
+			 * just need to round down to a multiple of 16.
+			 */
+			nbytes = round_down(nbytes, AES_BLOCK_SIZE);
+			aes_gcm_update(key, le_ctr, ghash_acc,
+				       walk.src.virt.addr, walk.dst.virt.addr,
+				       nbytes, flags);
+			le_ctr[0] += nbytes / AES_BLOCK_SIZE;
+			kernel_fpu_end();
+			err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+			kernel_fpu_begin();
+		} else {
+			/* Last segment: process all remaining data. */
+			aes_gcm_update(key, le_ctr, ghash_acc,
+				       walk.src.virt.addr, walk.dst.virt.addr,
+				       nbytes, flags);
+			err = skcipher_walk_done(&walk, 0);
+			/*
+			 * The low word of the counter isn't used by the
+			 * finalize, so there's no need to increment it here.
+			 */
+		}
+	}
+	if (err)
+		goto out;
+
+	/* Finalize */
+	taglen = crypto_aead_authsize(tfm);
+	if (flags & FLAG_ENC) {
+		/* Finish computing the auth tag. */
+		aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen,
+				  req->cryptlen, flags);
+
+		/* Store the computed auth tag in the dst scatterlist. */
+		scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen +
+					 req->cryptlen, taglen, 1);
+	} else {
+		unsigned int datalen = req->cryptlen - taglen;
+		u8 tag[16];
+
+		/* Get the transmitted auth tag from the src scatterlist. */
+		scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen,
+					 taglen, 0);
+		/*
+		 * Finish computing the auth tag and compare it to the
+		 * transmitted one.  The assembly function does the actual tag
+		 * comparison.  Here, just check the boolean result.
+		 */
+		if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen,
+				       datalen, tag, taglen, flags))
+			err = -EBADMSG;
+	}
+out:
+	kernel_fpu_end();
+	return err;
+}
+
+#define DEFINE_GCM_ALGS(suffix, flags, generic_driver_name, rfc_driver_name,   \
+			ctxsize, priority)				       \
+									       \
+static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key,     \
+			       unsigned int keylen)			       \
+{									       \
+	return gcm_setkey(tfm, raw_key, keylen, (flags));		       \
+}									       \
+									       \
+static int gcm_encrypt_##suffix(struct aead_request *req)		       \
+{									       \
+	return gcm_crypt(req, (flags) | FLAG_ENC);			       \
+}									       \
+									       \
+static int gcm_decrypt_##suffix(struct aead_request *req)		       \
+{									       \
+	return gcm_crypt(req, (flags));					       \
+}									       \
+									       \
+static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \
+				   unsigned int keylen)			       \
+{									       \
+	return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106);       \
+}									       \
+									       \
+static int rfc4106_encrypt_##suffix(struct aead_request *req)		       \
+{									       \
+	return gcm_crypt(req, (flags) | FLAG_RFC4106 | FLAG_ENC);	       \
+}									       \
+									       \
+static int rfc4106_decrypt_##suffix(struct aead_request *req)		       \
+{									       \
+	return gcm_crypt(req, (flags) | FLAG_RFC4106);			       \
+}									       \
+									       \
+static struct aead_alg aes_gcm_algs_##suffix[] = { {			       \
+	.setkey			= gcm_setkey_##suffix,			       \
+	.setauthsize		= generic_gcmaes_set_authsize,		       \
+	.encrypt		= gcm_encrypt_##suffix,			       \
+	.decrypt		= gcm_decrypt_##suffix,			       \
+	.ivsize			= GCM_AES_IV_SIZE,			       \
+	.chunksize		= AES_BLOCK_SIZE,			       \
+	.maxauthsize		= 16,					       \
+	.base = {							       \
+		.cra_name		= "__gcm(aes)",			       \
+		.cra_driver_name	= "__" generic_driver_name,	       \
+		.cra_priority		= (priority),			       \
+		.cra_flags		= CRYPTO_ALG_INTERNAL,		       \
+		.cra_blocksize		= 1,				       \
+		.cra_ctxsize		= (ctxsize),			       \
+		.cra_module		= THIS_MODULE,			       \
+	},								       \
+}, {									       \
+	.setkey			= rfc4106_setkey_##suffix,		       \
+	.setauthsize		= common_rfc4106_set_authsize,		       \
+	.encrypt		= rfc4106_encrypt_##suffix,		       \
+	.decrypt		= rfc4106_decrypt_##suffix,		       \
+	.ivsize			= GCM_RFC4106_IV_SIZE,			       \
+	.chunksize		= AES_BLOCK_SIZE,			       \
+	.maxauthsize		= 16,					       \
+	.base = {							       \
+		.cra_name		= "__rfc4106(gcm(aes))",	       \
+		.cra_driver_name	= "__" rfc_driver_name,		       \
+		.cra_priority		= (priority),			       \
+		.cra_flags		= CRYPTO_ALG_INTERNAL,		       \
+		.cra_blocksize		= 1,				       \
+		.cra_ctxsize		= (ctxsize),			       \
+		.cra_module		= THIS_MODULE,			       \
+	},								       \
+} };									       \
+									       \
+static struct simd_aead_alg *aes_gcm_simdalgs_##suffix[2]		       \
+
+/* aes_gcm_algs_aesni */
+DEFINE_GCM_ALGS(aesni, /* no flags */ 0,
+		"generic-gcm-aesni", "rfc4106-gcm-aesni",
+		AES_GCM_KEY_AESNI_SIZE, 400);
+
+/* aes_gcm_algs_aesni_avx */
+DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX,
+		"generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx",
+		AES_GCM_KEY_AESNI_SIZE, 500);
+
+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
+/* aes_gcm_algs_vaes_avx10_256 */
+DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256,
+		"generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256",
+		AES_GCM_KEY_AVX10_SIZE, 700);
+
+/* aes_gcm_algs_vaes_avx10_512 */
+DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512,
+		"generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512",
+		AES_GCM_KEY_AVX10_SIZE, 800);
+#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
+
 /*
  * This is a list of CPU models that are known to suffer from downclocking when
- * zmm registers (512-bit vectors) are used.  On these CPUs, the AES-XTS
- * implementation with zmm registers won't be used by default.  An
- * implementation with ymm registers (256-bit vectors) will be used instead.
+ * zmm registers (512-bit vectors) are used.  On these CPUs, the AES mode
+ * implementations with zmm registers won't be used by default.  Implementations
+ * with ymm registers (256-bit vectors) will be used by default instead.
  */
 static const struct x86_cpu_id zmm_exclusion_list[] = {
 	X86_MATCH_VFM(INTEL_SKYLAKE_X,		0),
@@ -1236,7 +1559,7 @@ static const struct x86_cpu_id zmm_exclusion_list[] = {
 	{},
 };
 
-static int __init register_xts_algs(void)
+static int __init register_avx_algs(void)
 {
 	int err;
 
@@ -1246,6 +1569,11 @@ static int __init register_xts_algs(void)
 					     &aes_xts_simdalg_aesni_avx);
 	if (err)
 		return err;
+	err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx,
+					 ARRAY_SIZE(aes_gcm_algs_aesni_avx),
+					 aes_gcm_simdalgs_aesni_avx);
+	if (err)
+		return err;
 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
 	if (!boot_cpu_has(X86_FEATURE_AVX2) ||
 	    !boot_cpu_has(X86_FEATURE_VAES) ||
@@ -1269,23 +1597,42 @@ static int __init register_xts_algs(void)
 					     &aes_xts_simdalg_vaes_avx10_256);
 	if (err)
 		return err;
+	err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256,
+					 ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
+					 aes_gcm_simdalgs_vaes_avx10_256);
+	if (err)
+		return err;
+
+	if (x86_match_cpu(zmm_exclusion_list)) {
+		int i;
 
-	if (x86_match_cpu(zmm_exclusion_list))
 		aes_xts_alg_vaes_avx10_512.base.cra_priority = 1;
+		for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++)
+			aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1;
+	}
 
 	err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1,
 					     &aes_xts_simdalg_vaes_avx10_512);
 	if (err)
 		return err;
+	err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512,
+					 ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512),
+					 aes_gcm_simdalgs_vaes_avx10_512);
+	if (err)
+		return err;
 #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
 	return 0;
 }
 
-static void unregister_xts_algs(void)
+static void unregister_avx_algs(void)
 {
 	if (aes_xts_simdalg_aesni_avx)
 		simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1,
 					  &aes_xts_simdalg_aesni_avx);
+	if (aes_gcm_simdalgs_aesni_avx[0])
+		simd_unregister_aeads(aes_gcm_algs_aesni_avx,
+				      ARRAY_SIZE(aes_gcm_algs_aesni_avx),
+				      aes_gcm_simdalgs_aesni_avx);
 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
 	if (aes_xts_simdalg_vaes_avx2)
 		simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1,
@@ -1293,106 +1640,33 @@ static void unregister_xts_algs(void)
 	if (aes_xts_simdalg_vaes_avx10_256)
 		simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1,
 					  &aes_xts_simdalg_vaes_avx10_256);
+	if (aes_gcm_simdalgs_vaes_avx10_256[0])
+		simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256,
+				      ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256),
+				      aes_gcm_simdalgs_vaes_avx10_256);
 	if (aes_xts_simdalg_vaes_avx10_512)
 		simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1,
 					  &aes_xts_simdalg_vaes_avx10_512);
+	if (aes_gcm_simdalgs_vaes_avx10_512[0])
+		simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512,
+				      ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512),
+				      aes_gcm_simdalgs_vaes_avx10_512);
 #endif
 }
 #else /* CONFIG_X86_64 */
-static int __init register_xts_algs(void)
+static struct aead_alg aes_gcm_algs_aesni[0];
+static struct simd_aead_alg *aes_gcm_simdalgs_aesni[0];
+
+static int __init register_avx_algs(void)
 {
 	return 0;
 }
 
-static void unregister_xts_algs(void)
+static void unregister_avx_algs(void)
 {
 }
 #endif /* !CONFIG_X86_64 */
 
-#ifdef CONFIG_X86_64
-static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key,
-				  unsigned int key_len)
-{
-	struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead);
-
-	return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?:
-	       aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded,
-					  ctx->hash_subkey);
-}
-
-static int generic_gcmaes_encrypt(struct aead_request *req)
-{
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
-	void *aes_ctx = &(ctx->aes_key_expanded);
-	u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
-	u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
-	__be32 counter = cpu_to_be32(1);
-
-	memcpy(iv, req->iv, 12);
-	*((__be32 *)(iv+12)) = counter;
-
-	return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv,
-			      aes_ctx);
-}
-
-static int generic_gcmaes_decrypt(struct aead_request *req)
-{
-	__be32 counter = cpu_to_be32(1);
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm);
-	void *aes_ctx = &(ctx->aes_key_expanded);
-	u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8);
-	u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN);
-
-	memcpy(iv, req->iv, 12);
-	*((__be32 *)(iv+12)) = counter;
-
-	return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv,
-			      aes_ctx);
-}
-
-static struct aead_alg aesni_aeads[] = { {
-	.setkey			= common_rfc4106_set_key,
-	.setauthsize		= common_rfc4106_set_authsize,
-	.encrypt		= helper_rfc4106_encrypt,
-	.decrypt		= helper_rfc4106_decrypt,
-	.ivsize			= GCM_RFC4106_IV_SIZE,
-	.maxauthsize		= 16,
-	.base = {
-		.cra_name		= "__rfc4106(gcm(aes))",
-		.cra_driver_name	= "__rfc4106-gcm-aesni",
-		.cra_priority		= 400,
-		.cra_flags		= CRYPTO_ALG_INTERNAL,
-		.cra_blocksize		= 1,
-		.cra_ctxsize		= sizeof(struct aesni_rfc4106_gcm_ctx),
-		.cra_alignmask		= 0,
-		.cra_module		= THIS_MODULE,
-	},
-}, {
-	.setkey			= generic_gcmaes_set_key,
-	.setauthsize		= generic_gcmaes_set_authsize,
-	.encrypt		= generic_gcmaes_encrypt,
-	.decrypt		= generic_gcmaes_decrypt,
-	.ivsize			= GCM_AES_IV_SIZE,
-	.maxauthsize		= 16,
-	.base = {
-		.cra_name		= "__gcm(aes)",
-		.cra_driver_name	= "__generic-gcm-aesni",
-		.cra_priority		= 400,
-		.cra_flags		= CRYPTO_ALG_INTERNAL,
-		.cra_blocksize		= 1,
-		.cra_ctxsize		= sizeof(struct generic_gcmaes_ctx),
-		.cra_alignmask		= 0,
-		.cra_module		= THIS_MODULE,
-	},
-} };
-#else
-static struct aead_alg aesni_aeads[0];
-#endif
-
-static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)];
-
 static const struct x86_cpu_id aesni_cpu_id[] = {
 	X86_MATCH_FEATURE(X86_FEATURE_AES, NULL),
 	{}
@@ -1406,17 +1680,6 @@ static int __init aesni_init(void)
 	if (!x86_match_cpu(aesni_cpu_id))
 		return -ENODEV;
 #ifdef CONFIG_X86_64
-	if (boot_cpu_has(X86_FEATURE_AVX2)) {
-		pr_info("AVX2 version of gcm_enc/dec engaged.\n");
-		static_branch_enable(&gcm_use_avx);
-		static_branch_enable(&gcm_use_avx2);
-	} else
-	if (boot_cpu_has(X86_FEATURE_AVX)) {
-		pr_info("AVX version of gcm_enc/dec engaged.\n");
-		static_branch_enable(&gcm_use_avx);
-	} else {
-		pr_info("SSE version of gcm_enc/dec engaged.\n");
-	}
 	if (boot_cpu_has(X86_FEATURE_AVX)) {
 		/* optimize performance of ctr mode encryption transform */
 		static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm);
@@ -1434,8 +1697,9 @@ static int __init aesni_init(void)
 	if (err)
 		goto unregister_cipher;
 
-	err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads),
-					 aesni_simd_aeads);
+	err = simd_register_aeads_compat(aes_gcm_algs_aesni,
+					 ARRAY_SIZE(aes_gcm_algs_aesni),
+					 aes_gcm_simdalgs_aesni);
 	if (err)
 		goto unregister_skciphers;
 
@@ -1447,22 +1711,22 @@ static int __init aesni_init(void)
 		goto unregister_aeads;
 #endif /* CONFIG_X86_64 */
 
-	err = register_xts_algs();
+	err = register_avx_algs();
 	if (err)
-		goto unregister_xts;
+		goto unregister_avx;
 
 	return 0;
 
-unregister_xts:
-	unregister_xts_algs();
+unregister_avx:
+	unregister_avx_algs();
 #ifdef CONFIG_X86_64
 	if (aesni_simd_xctr)
 		simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
 unregister_aeads:
 #endif /* CONFIG_X86_64 */
-	simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
-				aesni_simd_aeads);
-
+	simd_unregister_aeads(aes_gcm_algs_aesni,
+			      ARRAY_SIZE(aes_gcm_algs_aesni),
+			      aes_gcm_simdalgs_aesni);
 unregister_skciphers:
 	simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
 				  aesni_simd_skciphers);
@@ -1473,8 +1737,9 @@ static int __init aesni_init(void)
 
 static void __exit aesni_exit(void)
 {
-	simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads),
-			      aesni_simd_aeads);
+	simd_unregister_aeads(aes_gcm_algs_aesni,
+			      ARRAY_SIZE(aes_gcm_algs_aesni),
+			      aes_gcm_simdalgs_aesni);
 	simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers),
 				  aesni_simd_skciphers);
 	crypto_unregister_alg(&aesni_cipher_alg);
@@ -1482,7 +1747,7 @@ static void __exit aesni_exit(void)
 	if (boot_cpu_has(X86_FEATURE_AVX))
 		simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr);
 #endif /* CONFIG_X86_64 */
-	unregister_xts_algs();
+	unregister_avx_algs();
 }
 
 late_initcall(aesni_init);
-- 
2.46.0

From e2128d28b8bd50d15439421d67af631ed85d8426 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sun, 8 Sep 2024 10:20:46 +0200
Subject: [PATCH 06/12] fixes

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 arch/Kconfig                            |  4 ++--
 arch/x86/kernel/amd_nb.c                |  4 ++++
 drivers/cpufreq/intel_pstate.c          |  4 +++-
 drivers/gpu/drm/nouveau/nouveau_bo.c    |  2 +-
 drivers/gpu/drm/nouveau/nouveau_chan.c  |  2 +-
 drivers/gpu/drm/nouveau/nouveau_dmem.c  |  2 +-
 drivers/gpu/drm/nouveau/nouveau_fence.c | 30 +------------------------
 drivers/gpu/drm/nouveau/nouveau_fence.h |  2 +-
 drivers/gpu/drm/nouveau/nouveau_gem.c   |  2 +-
 drivers/powercap/intel_rapl_common.c    |  1 +
 10 files changed, 16 insertions(+), 37 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 975dd22a2dbd..de69b8f5b5be 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1050,7 +1050,7 @@ config ARCH_MMAP_RND_BITS
 	int "Number of bits to use for ASLR of mmap base address" if EXPERT
 	range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX
 	default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT
-	default ARCH_MMAP_RND_BITS_MIN
+	default ARCH_MMAP_RND_BITS_MAX
 	depends on HAVE_ARCH_MMAP_RND_BITS
 	help
 	  This value can be used to select the number of bits to use to
@@ -1084,7 +1084,7 @@ config ARCH_MMAP_RND_COMPAT_BITS
 	int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT
 	range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX
 	default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT
-	default ARCH_MMAP_RND_COMPAT_BITS_MIN
+	default ARCH_MMAP_RND_COMPAT_BITS_MAX
 	depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS
 	help
 	  This value can be used to select the number of bits to use to
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 027a8c7a2c9e..41043d9b885a 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -43,6 +43,8 @@
 #define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4	0x14f4
 #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4	0x12fc
 #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4	0x12c4
+#define PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4	0x124c
+#define PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4	0x12bc
 #define PCI_DEVICE_ID_AMD_MI200_DF_F4		0x14d4
 #define PCI_DEVICE_ID_AMD_MI300_DF_F4		0x152c
 
@@ -122,6 +124,8 @@ static const struct pci_device_id amd_nb_link_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M60H_DF_F4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F4) },
 	{}
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 1035c074f36a..cc54238596ed 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -3393,7 +3393,9 @@ static const struct x86_cpu_id intel_epp_default[] = {
 	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, HWP_SET_DEF_BALANCE_PERF_EPP(102)),
 	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, HWP_SET_DEF_BALANCE_PERF_EPP(32)),
 	X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, HWP_SET_EPP_VALUES(HWP_EPP_POWERSAVE,
-							HWP_EPP_BALANCE_POWERSAVE, 115, 16)),
+								    179, 64, 16)),
+	X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE, HWP_SET_EPP_VALUES(HWP_EPP_POWERSAVE,
+								 179, 64, 16)),
 	{}
 };
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 70fb003a6666..0712d0b15170 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -898,7 +898,7 @@ nouveau_bo_move_m2mf(struct ttm_buffer_object *bo, int evict,
 	 * Without this the operation can timeout and we'll fallback to a
 	 * software copy, which might take several minutes to finish.
 	 */
-	nouveau_fence_wait(fence, false, false);
+	nouveau_fence_wait(fence, false);
 	ret = ttm_bo_move_accel_cleanup(bo, &fence->base, evict, false,
 					new_reg);
 	nouveau_fence_unref(&fence);
diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c b/drivers/gpu/drm/nouveau/nouveau_chan.c
index 7c97b2886807..66fca95c10c7 100644
--- a/drivers/gpu/drm/nouveau/nouveau_chan.c
+++ b/drivers/gpu/drm/nouveau/nouveau_chan.c
@@ -72,7 +72,7 @@ nouveau_channel_idle(struct nouveau_channel *chan)
 
 		ret = nouveau_fence_new(&fence, chan);
 		if (!ret) {
-			ret = nouveau_fence_wait(fence, false, false);
+			ret = nouveau_fence_wait(fence, false);
 			nouveau_fence_unref(&fence);
 		}
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 6fb65b01d778..6719353e2e13 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -128,7 +128,7 @@ static void nouveau_dmem_page_free(struct page *page)
 static void nouveau_dmem_fence_done(struct nouveau_fence **fence)
 {
 	if (fence) {
-		nouveau_fence_wait(*fence, true, false);
+		nouveau_fence_wait(*fence, false);
 		nouveau_fence_unref(fence);
 	} else {
 		/*
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 93f08f9479d8..ba469767a20f 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -311,39 +311,11 @@ nouveau_fence_wait_legacy(struct dma_fence *f, bool intr, long wait)
 	return timeout - t;
 }
 
-static int
-nouveau_fence_wait_busy(struct nouveau_fence *fence, bool intr)
-{
-	int ret = 0;
-
-	while (!nouveau_fence_done(fence)) {
-		if (time_after_eq(jiffies, fence->timeout)) {
-			ret = -EBUSY;
-			break;
-		}
-
-		__set_current_state(intr ?
-				    TASK_INTERRUPTIBLE :
-				    TASK_UNINTERRUPTIBLE);
-
-		if (intr && signal_pending(current)) {
-			ret = -ERESTARTSYS;
-			break;
-		}
-	}
-
-	__set_current_state(TASK_RUNNING);
-	return ret;
-}
-
 int
-nouveau_fence_wait(struct nouveau_fence *fence, bool lazy, bool intr)
+nouveau_fence_wait(struct nouveau_fence *fence, bool intr)
 {
 	long ret;
 
-	if (!lazy)
-		return nouveau_fence_wait_busy(fence, intr);
-
 	ret = dma_fence_wait_timeout(&fence->base, intr, 15 * HZ);
 	if (ret < 0)
 		return ret;
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h b/drivers/gpu/drm/nouveau/nouveau_fence.h
index 8bc065acfe35..1b63197b744a 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.h
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.h
@@ -23,7 +23,7 @@ void nouveau_fence_unref(struct nouveau_fence **);
 
 int  nouveau_fence_emit(struct nouveau_fence *);
 bool nouveau_fence_done(struct nouveau_fence *);
-int  nouveau_fence_wait(struct nouveau_fence *, bool lazy, bool intr);
+int  nouveau_fence_wait(struct nouveau_fence *, bool intr);
 int  nouveau_fence_sync(struct nouveau_bo *, struct nouveau_channel *, bool exclusive, bool intr);
 
 struct nouveau_fence_chan {
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 5a887d67dc0e..2e535caa7d6e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -928,7 +928,7 @@ nouveau_gem_ioctl_pushbuf(struct drm_device *dev, void *data,
 	}
 
 	if (sync) {
-		if (!(ret = nouveau_fence_wait(fence, false, false))) {
+		if (!(ret = nouveau_fence_wait(fence, false))) {
 			if ((ret = dma_fence_get_status(&fence->base)) == 1)
 				ret = 0;
 		}
diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index aac0744011a3..d51d4ec8d707 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -1285,6 +1285,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
 
 	X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd),
 	X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd),
+	X86_MATCH_VENDOR_FAM(AMD, 0x1A, &rapl_defaults_amd),
 	X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd),
 	{}
 };
-- 
2.46.0

From b2523fdbff8568f9dc194eebd8065ac5ac13d554 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Wed, 4 Sep 2024 16:52:28 +0200
Subject: [PATCH] soundwire: stream: Revert "soundwire: stream: fix programming
 slave ports for non-continous port maps"

This reverts commit ab8d66d132bc8f1992d3eb6cab8d32dda6733c84 because it
breaks codecs using non-continuous masks in source and sink ports.  The
commit missed the point that port numbers are not used as indices for
iterating over prop.sink_ports or prop.source_ports.

Soundwire core and existing codecs expect that the array passed as
prop.sink_ports and prop.source_ports is continuous.  The port mask still
might be non-continuous, but that's unrelated.

Reported-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Closes: https://lore.kernel.org/all/b6c75eee-761d-44c8-8413-2a5b34ee2f98@linux.intel.com/
Fixes: ab8d66d132bc ("soundwire: stream: fix programming slave ports for non-continous port maps")
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Cherry-picked-for: https://gitlab.archlinux.org/archlinux/packaging/packages/linux/-/issues/76
---
 drivers/soundwire/stream.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/soundwire/stream.c b/drivers/soundwire/stream.c
index 00191b1d226014..4e9e7d2a942d8a 100644
--- a/drivers/soundwire/stream.c
+++ b/drivers/soundwire/stream.c
@@ -1286,18 +1286,18 @@ struct sdw_dpn_prop *sdw_get_slave_dpn_prop(struct sdw_slave *slave,
 					    unsigned int port_num)
 {
 	struct sdw_dpn_prop *dpn_prop;
-	unsigned long mask;
+	u8 num_ports;
 	int i;
 
 	if (direction == SDW_DATA_DIR_TX) {
-		mask = slave->prop.source_ports;
+		num_ports = hweight32(slave->prop.source_ports);
 		dpn_prop = slave->prop.src_dpn_prop;
 	} else {
-		mask = slave->prop.sink_ports;
+		num_ports = hweight32(slave->prop.sink_ports);
 		dpn_prop = slave->prop.sink_dpn_prop;
 	}
 
-	for_each_set_bit(i, &mask, 32) {
+	for (i = 0; i < num_ports; i++) {
 		if (dpn_prop[i].num == port_num)
 			return &dpn_prop[i];
 	}
From 90984a4f1e0c0ce2354448ea648bda14c57da75f Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sun, 8 Sep 2024 10:20:59 +0200
Subject: [PATCH 07/12] intel-pstate

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 arch/x86/include/asm/topology.h  |  13 ++
 arch/x86/kernel/cpu/aperfmperf.c |  89 +++++++++++-
 drivers/cpufreq/intel_pstate.c   | 232 ++++++++++++++++++++++++++++++-
 3 files changed, 328 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index abe3a8f22cbd..aef70336d624 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -282,9 +282,22 @@ static inline long arch_scale_freq_capacity(int cpu)
 }
 #define arch_scale_freq_capacity arch_scale_freq_capacity
 
+bool arch_enable_hybrid_capacity_scale(void);
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+			   unsigned long cap_freq, unsigned long base_freq);
+
+unsigned long arch_scale_cpu_capacity(int cpu);
+#define arch_scale_cpu_capacity arch_scale_cpu_capacity
+
 extern void arch_set_max_freq_ratio(bool turbo_disabled);
 extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled);
 #else
+static inline bool arch_enable_hybrid_capacity_scale(void) { return false; }
+static inline void arch_set_cpu_capacity(int cpu, unsigned long cap,
+					 unsigned long max_cap,
+					 unsigned long cap_freq,
+					 unsigned long base_freq) { }
+
 static inline void arch_set_max_freq_ratio(bool turbo_disabled) { }
 static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { }
 #endif
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index b3fa61d45352..f85d3f4233d4 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -347,9 +347,89 @@ static DECLARE_WORK(disable_freq_invariance_work,
 DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
 EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
 
+static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
+
+struct arch_hybrid_cpu_scale {
+	unsigned long capacity;
+	unsigned long freq_ratio;
+};
+
+static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
+
+/**
+ * arch_enable_hybrid_capacity_scale - Enable hybrid CPU capacity scaling
+ *
+ * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
+ * initialize it and set the static key controlling its code paths.
+ *
+ * Must be called before arch_set_cpu_capacity().
+ */
+bool arch_enable_hybrid_capacity_scale(void)
+{
+	int cpu;
+
+	if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
+		WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
+		return true;
+	}
+
+	arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
+	if (!arch_cpu_scale)
+		return false;
+
+	for_each_possible_cpu(cpu) {
+		per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
+		per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
+	}
+
+	static_branch_enable(&arch_hybrid_cap_scale_key);
+
+	pr_info("Hybrid CPU capacity scaling enabled\n");
+
+	return true;
+}
+
+/**
+ * arch_set_cpu_capacity - Set scale-invariance parameters for a CPU
+ * @cpu: Target CPU.
+ * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
+ * @max_cap: System-wide maximum CPU capacity.
+ * @cap_freq: Frequency of @cpu corresponding to @cap.
+ * @base_freq: Frequency of @cpu at which MPERF counts.
+ *
+ * The units in which @cap and @max_cap are expressed do not matter, so long
+ * as they are consistent, because the former is effectively divided by the
+ * latter.  Analogously for @cap_freq and @base_freq.
+ *
+ * After calling this function for all CPUs, call arch_rebuild_sched_domains()
+ * to let the scheduler know that capacity-aware scheduling can be used going
+ * forward.
+ */
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+			   unsigned long cap_freq, unsigned long base_freq)
+{
+	if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
+		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
+			   div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
+		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
+			   div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
+	} else {
+		WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
+	}
+}
+
+unsigned long arch_scale_cpu_capacity(int cpu)
+{
+	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+		return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);
+
+	return SCHED_CAPACITY_SCALE;
+}
+EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
+
 static void scale_freq_tick(u64 acnt, u64 mcnt)
 {
-	u64 freq_scale;
+	u64 freq_scale, freq_ratio;
 
 	if (!arch_scale_freq_invariant())
 		return;
@@ -357,7 +437,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt)
 	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
 		goto error;
 
-	if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
+	else
+		freq_ratio = arch_max_freq_ratio;
+
+	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
 		goto error;
 
 	freq_scale = div64_u64(acnt, mcnt);
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index cc54238596ed..4e59a5446861 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -16,6 +16,7 @@
 #include <linux/tick.h>
 #include <linux/slab.h>
 #include <linux/sched/cpufreq.h>
+#include <linux/sched/smt.h>
 #include <linux/list.h>
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
@@ -215,6 +216,7 @@ struct global_params {
  * @hwp_req_cached:	Cached value of the last HWP Request MSR
  * @hwp_cap_cached:	Cached value of the last HWP Capabilities MSR
  * @last_io_update:	Last time when IO wake flag was set
+ * @capacity_perf:	Highest perf used for scale invariance
  * @sched_flags:	Store scheduler flags for possible cross CPU update
  * @hwp_boost_min:	Last HWP boosted min performance
  * @suspended:		Whether or not the driver has been suspended.
@@ -253,6 +255,7 @@ struct cpudata {
 	u64 hwp_req_cached;
 	u64 hwp_cap_cached;
 	u64 last_io_update;
+	unsigned int capacity_perf;
 	unsigned int sched_flags;
 	u32 hwp_boost_min;
 	bool suspended;
@@ -295,6 +298,7 @@ static int hwp_mode_bdw __ro_after_init;
 static bool per_cpu_limits __ro_after_init;
 static bool hwp_forced __ro_after_init;
 static bool hwp_boost __read_mostly;
+static bool hwp_is_hybrid;
 
 static struct cpufreq_driver *intel_pstate_driver __read_mostly;
 
@@ -933,6 +937,135 @@ static struct freq_attr *hwp_cpufreq_attrs[] = {
 	NULL,
 };
 
+static struct cpudata *hybrid_max_perf_cpu __read_mostly;
+/*
+ * Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata,
+ * and the x86 arch scale-invariance information from concurrent updates.
+ */
+static DEFINE_MUTEX(hybrid_capacity_lock);
+
+static void hybrid_set_cpu_capacity(struct cpudata *cpu)
+{
+	arch_set_cpu_capacity(cpu->cpu, cpu->capacity_perf,
+			      hybrid_max_perf_cpu->capacity_perf,
+			      cpu->capacity_perf,
+			      cpu->pstate.max_pstate_physical);
+
+	pr_debug("CPU%d: perf = %u, max. perf = %u, base perf = %d\n", cpu->cpu,
+		 cpu->capacity_perf, hybrid_max_perf_cpu->capacity_perf,
+		 cpu->pstate.max_pstate_physical);
+}
+
+static void hybrid_clear_cpu_capacity(unsigned int cpunum)
+{
+	arch_set_cpu_capacity(cpunum, 1, 1, 1, 1);
+}
+
+static void hybrid_get_capacity_perf(struct cpudata *cpu)
+{
+	if (READ_ONCE(global.no_turbo)) {
+		cpu->capacity_perf = cpu->pstate.max_pstate_physical;
+		return;
+	}
+
+	cpu->capacity_perf = HWP_HIGHEST_PERF(READ_ONCE(cpu->hwp_cap_cached));
+}
+
+static void hybrid_set_capacity_of_cpus(void)
+{
+	int cpunum;
+
+	for_each_online_cpu(cpunum) {
+		struct cpudata *cpu = all_cpu_data[cpunum];
+
+		if (cpu)
+			hybrid_set_cpu_capacity(cpu);
+	}
+}
+
+static void hybrid_update_cpu_scaling(void)
+{
+	struct cpudata *max_perf_cpu = NULL;
+	unsigned int max_cap_perf = 0;
+	int cpunum;
+
+	for_each_online_cpu(cpunum) {
+		struct cpudata *cpu = all_cpu_data[cpunum];
+
+		if (!cpu)
+			continue;
+
+		/*
+		 * During initialization, CPU performance at full capacity needs
+		 * to be determined.
+		 */
+		if (!hybrid_max_perf_cpu)
+			hybrid_get_capacity_perf(cpu);
+
+		/*
+		 * If hybrid_max_perf_cpu is not NULL at this point, it is
+		 * being replaced, so don't take it into account when looking
+		 * for the new one.
+		 */
+		if (cpu == hybrid_max_perf_cpu)
+			continue;
+
+		if (cpu->capacity_perf > max_cap_perf) {
+			max_cap_perf = cpu->capacity_perf;
+			max_perf_cpu = cpu;
+		}
+	}
+
+	if (max_perf_cpu) {
+		hybrid_max_perf_cpu = max_perf_cpu;
+		hybrid_set_capacity_of_cpus();
+	} else {
+		pr_info("Found no CPUs with nonzero maximum performance\n");
+		/* Revert to the flat CPU capacity structure. */
+		for_each_online_cpu(cpunum)
+			hybrid_clear_cpu_capacity(cpunum);
+	}
+}
+
+static void __hybrid_init_cpu_scaling(void)
+{
+	hybrid_max_perf_cpu = NULL;
+	hybrid_update_cpu_scaling();
+}
+
+static void hybrid_init_cpu_scaling(void)
+{
+	bool disable_itmt = false;
+
+	mutex_lock(&hybrid_capacity_lock);
+
+	/*
+	 * If hybrid_max_perf_cpu is set at this point, the hybrid CPU capacity
+	 * scaling has been enabled already and the driver is just changing the
+	 * operation mode.
+	 */
+	if (hybrid_max_perf_cpu) {
+		__hybrid_init_cpu_scaling();
+		goto unlock;
+	}
+
+	/*
+	 * On hybrid systems, use asym capacity instead of ITMT, but because
+	 * the capacity of SMT threads is not deterministic even approximately,
+	 * do not do that when SMT is in use.
+	 */
+	if (hwp_is_hybrid && !sched_smt_active() && arch_enable_hybrid_capacity_scale()) {
+		__hybrid_init_cpu_scaling();
+		disable_itmt = true;
+	}
+
+unlock:
+	mutex_unlock(&hybrid_capacity_lock);
+
+	if (disable_itmt)
+		sched_clear_itmt_support();
+}
+
 static void __intel_pstate_get_hwp_cap(struct cpudata *cpu)
 {
 	u64 cap;
@@ -961,6 +1094,43 @@ static void intel_pstate_get_hwp_cap(struct cpudata *cpu)
 	}
 }
 
+static void hybrid_update_capacity(struct cpudata *cpu)
+{
+	unsigned int max_cap_perf;
+
+	mutex_lock(&hybrid_capacity_lock);
+
+	if (!hybrid_max_perf_cpu)
+		goto unlock;
+
+	/*
+	 * The maximum performance of the CPU may have changed, but assume
+	 * that the performance of the other CPUs has not changed.
+	 */
+	max_cap_perf = hybrid_max_perf_cpu->capacity_perf;
+
+	intel_pstate_get_hwp_cap(cpu);
+
+	hybrid_get_capacity_perf(cpu);
+	/* Should hybrid_max_perf_cpu be replaced by this CPU? */
+	if (cpu->capacity_perf > max_cap_perf) {
+		hybrid_max_perf_cpu = cpu;
+		hybrid_set_capacity_of_cpus();
+		goto unlock;
+	}
+
+	/* If this CPU is hybrid_max_perf_cpu, should it be replaced? */
+	if (cpu == hybrid_max_perf_cpu && cpu->capacity_perf < max_cap_perf) {
+		hybrid_update_cpu_scaling();
+		goto unlock;
+	}
+
+	hybrid_set_cpu_capacity(cpu);
+
+unlock:
+	mutex_unlock(&hybrid_capacity_lock);
+}
+
 static void intel_pstate_hwp_set(unsigned int cpu)
 {
 	struct cpudata *cpu_data = all_cpu_data[cpu];
@@ -1069,6 +1239,22 @@ static void intel_pstate_hwp_offline(struct cpudata *cpu)
 		value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE);
 
 	wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
+
+	mutex_lock(&hybrid_capacity_lock);
+
+	if (!hybrid_max_perf_cpu) {
+		mutex_unlock(&hybrid_capacity_lock);
+
+		return;
+	}
+
+	if (hybrid_max_perf_cpu == cpu)
+		hybrid_update_cpu_scaling();
+
+	mutex_unlock(&hybrid_capacity_lock);
+
+	/* Reset the capacity of the CPU going offline to the initial value. */
+	hybrid_clear_cpu_capacity(cpu->cpu);
 }
 
 #define POWER_CTL_EE_ENABLE	1
@@ -1164,21 +1350,46 @@ static void __intel_pstate_update_max_freq(struct cpudata *cpudata,
 static void intel_pstate_update_limits(unsigned int cpu)
 {
 	struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
+	struct cpudata *cpudata;
 
 	if (!policy)
 		return;
 
-	__intel_pstate_update_max_freq(all_cpu_data[cpu], policy);
+	cpudata = all_cpu_data[cpu];
+
+	__intel_pstate_update_max_freq(cpudata, policy);
+
+	/* Prevent the driver from being unregistered now. */
+	mutex_lock(&intel_pstate_driver_lock);
 
 	cpufreq_cpu_release(policy);
+
+	hybrid_update_capacity(cpudata);
+
+	mutex_unlock(&intel_pstate_driver_lock);
 }
 
 static void intel_pstate_update_limits_for_all(void)
 {
 	int cpu;
 
-	for_each_possible_cpu(cpu)
-		intel_pstate_update_limits(cpu);
+	for_each_possible_cpu(cpu) {
+		struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
+
+		if (!policy)
+			continue;
+
+		__intel_pstate_update_max_freq(all_cpu_data[cpu], policy);
+
+		cpufreq_cpu_release(policy);
+	}
+
+	mutex_lock(&hybrid_capacity_lock);
+
+	if (hybrid_max_perf_cpu)
+		__hybrid_init_cpu_scaling();
+
+	mutex_unlock(&hybrid_capacity_lock);
 }
 
 /************************** sysfs begin ************************/
@@ -1617,6 +1828,13 @@ static void intel_pstate_notify_work(struct work_struct *work)
 		__intel_pstate_update_max_freq(cpudata, policy);
 
 		cpufreq_cpu_release(policy);
+
+		/*
+		 * The driver will not be unregistered while this function is
+		 * running, so update the capacity without acquiring the driver
+		 * lock.
+		 */
+		hybrid_update_capacity(cpudata);
 	}
 
 	wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0);
@@ -2018,8 +2236,10 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
 
 		if (pstate_funcs.get_cpu_scaling) {
 			cpu->pstate.scaling = pstate_funcs.get_cpu_scaling(cpu->cpu);
-			if (cpu->pstate.scaling != perf_ctl_scaling)
+			if (cpu->pstate.scaling != perf_ctl_scaling) {
 				intel_pstate_hybrid_hwp_adjust(cpu);
+				hwp_is_hybrid = true;
+			}
 		} else {
 			cpu->pstate.scaling = perf_ctl_scaling;
 		}
@@ -2687,6 +2907,8 @@ static int intel_pstate_cpu_online(struct cpufreq_policy *policy)
 		 */
 		intel_pstate_hwp_reenable(cpu);
 		cpu->suspended = false;
+
+		hybrid_update_capacity(cpu);
 	}
 
 	return 0;
@@ -3129,6 +3351,8 @@ static int intel_pstate_register_driver(struct cpufreq_driver *driver)
 
 	global.min_perf_pct = min_perf_pct_min();
 
+	hybrid_init_cpu_scaling();
+
 	return 0;
 }
 
-- 
2.46.0

From b503c9c8ebb5f8ab64d6238b83a4c57a78b3bc73 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sun, 8 Sep 2024 10:21:08 +0200
Subject: [PATCH 08/12] ksm

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 arch/alpha/kernel/syscalls/syscall.tbl      |   3 +
 arch/arm/tools/syscall.tbl                  |   3 +
 arch/arm64/include/asm/unistd.h             |   2 +-
 arch/arm64/include/asm/unistd32.h           |   6 +
 arch/m68k/kernel/syscalls/syscall.tbl       |   3 +
 arch/microblaze/kernel/syscalls/syscall.tbl |   3 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   |   3 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   |   3 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   |   3 +
 arch/parisc/kernel/syscalls/syscall.tbl     |   3 +
 arch/powerpc/kernel/syscalls/syscall.tbl    |   3 +
 arch/s390/kernel/syscalls/syscall.tbl       |   3 +
 arch/sh/kernel/syscalls/syscall.tbl         |   3 +
 arch/sparc/kernel/syscalls/syscall.tbl      |   3 +
 arch/x86/entry/syscalls/syscall_32.tbl      |   3 +
 arch/x86/entry/syscalls/syscall_64.tbl      |   3 +
 arch/xtensa/kernel/syscalls/syscall.tbl     |   3 +
 include/linux/syscalls.h                    |   3 +
 include/uapi/asm-generic/unistd.h           |  11 +-
 kernel/sys.c                                | 147 ++++++++++++++++++++
 kernel/sys_ni.c                             |   3 +
 21 files changed, 215 insertions(+), 2 deletions(-)

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 74720667fe09..e6a11f3c0a2e 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -502,3 +502,6 @@
 570	common	lsm_set_self_attr		sys_lsm_set_self_attr
 571	common	lsm_list_modules		sys_lsm_list_modules
 572	common  mseal				sys_mseal
+573	common	process_ksm_enable		sys_process_ksm_enable
+574	common	process_ksm_disable		sys_process_ksm_disable
+575	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 2ed7d229c8f9..3f59e9c5c1ff 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -476,3 +476,6 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	process_ksm_enable		sys_process_ksm_enable
+464	common	process_ksm_disable		sys_process_ksm_disable
+465	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 1346579f802f..f3a77719eb05 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		463
+#define __NR_compat_syscalls		466
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 1386e8e751f2..ccdc523fa4bd 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -931,6 +931,12 @@ __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr)
 __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
 #define __NR_mseal 462
 __SYSCALL(__NR_mseal, sys_mseal)
+#define __NR_process_ksm_enable 463
+__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable)
+#define __NR_process_ksm_disable 464
+__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable)
+#define __NR_process_ksm_status 465
+__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 22a3cbd4c602..12d2c7594bf0 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -462,3 +462,6 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	process_ksm_enable		sys_process_ksm_enable
+464	common	process_ksm_disable		sys_process_ksm_disable
+465	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 2b81a6bd78b2..e2a93c856eed 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -468,3 +468,6 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	process_ksm_enable		sys_process_ksm_enable
+464	common	process_ksm_disable		sys_process_ksm_disable
+465	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 953f5b7dc723..b921fbf56fa6 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -401,3 +401,6 @@
 460	n32	lsm_set_self_attr		sys_lsm_set_self_attr
 461	n32	lsm_list_modules		sys_lsm_list_modules
 462	n32	mseal				sys_mseal
+463	n32	process_ksm_enable		sys_process_ksm_enable
+464	n32	process_ksm_disable		sys_process_ksm_disable
+465	n32	process_ksm_status		sys_process_ksm_status
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 1464c6be6eb3..8d7f9ddd66f4 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -377,3 +377,6 @@
 460	n64	lsm_set_self_attr		sys_lsm_set_self_attr
 461	n64	lsm_list_modules		sys_lsm_list_modules
 462	n64	mseal				sys_mseal
+463	n64	process_ksm_enable		sys_process_ksm_enable
+464	n64	process_ksm_disable		sys_process_ksm_disable
+465	n64	process_ksm_status		sys_process_ksm_status
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index 2439a2491cff..9d6142739954 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -450,3 +450,6 @@
 460	o32	lsm_set_self_attr		sys_lsm_set_self_attr
 461	o32	lsm_list_modules		sys_lsm_list_modules
 462	o32	mseal				sys_mseal
+463	o32	process_ksm_enable		sys_process_ksm_enable
+464	o32	process_ksm_disable		sys_process_ksm_disable
+465	o32	process_ksm_status		sys_process_ksm_status
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 66dc406b12e4..9d46476fd908 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -461,3 +461,6 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	process_ksm_enable		sys_process_ksm_enable
+464	common	process_ksm_disable		sys_process_ksm_disable
+465	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index ebae8415dfbb..16f71bc2f6f0 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -553,3 +553,6 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	process_ksm_enable		sys_process_ksm_enable
+464	common	process_ksm_disable		sys_process_ksm_disable
+465	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 01071182763e..7394bad8178e 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -465,3 +465,6 @@
 460  common	lsm_set_self_attr	sys_lsm_set_self_attr		sys_lsm_set_self_attr
 461  common	lsm_list_modules	sys_lsm_list_modules		sys_lsm_list_modules
 462  common	mseal			sys_mseal			sys_mseal
+463  common	process_ksm_enable	sys_process_ksm_enable		sys_process_ksm_enable
+464  common	process_ksm_disable	sys_process_ksm_disable		sys_process_ksm_disable
+465  common	process_ksm_status	sys_process_ksm_status		sys_process_ksm_status
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index c55fd7696d40..b9fc31221b87 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -466,3 +466,6 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal				sys_mseal
+463	common	process_ksm_enable		sys_process_ksm_enable
+464	common	process_ksm_disable		sys_process_ksm_disable
+465	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index cfdfb3707c16..0d79fd772854 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -508,3 +508,6 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal 				sys_mseal
+463	common	process_ksm_enable		sys_process_ksm_enable
+464	common	process_ksm_disable		sys_process_ksm_disable
+465	common	process_ksm_status		sys_process_ksm_status
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 4b71a2607bf5..812085b21a3f 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -467,3 +467,6 @@
 460	i386	lsm_set_self_attr	sys_lsm_set_self_attr
 461	i386	lsm_list_modules	sys_lsm_list_modules
 462	i386	mseal 			sys_mseal
+463	i386	process_ksm_enable		sys_process_ksm_enable
+464	i386	process_ksm_disable		sys_process_ksm_disable
+465	i386	process_ksm_status		sys_process_ksm_status
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index a8068f937290..bc2e635389cc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -384,6 +384,9 @@
 460	common	lsm_set_self_attr	sys_lsm_set_self_attr
 461	common	lsm_list_modules	sys_lsm_list_modules
 462 	common  mseal			sys_mseal
+463	common	process_ksm_enable	sys_process_ksm_enable
+464	common	process_ksm_disable	sys_process_ksm_disable
+465	common	process_ksm_status	sys_process_ksm_status
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 67083fc1b2f5..c1aecee4ad9b 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -433,3 +433,6 @@
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
 462	common	mseal 				sys_mseal
+463	common	process_ksm_enable		sys_process_ksm_enable
+464	common	process_ksm_disable		sys_process_ksm_disable
+465	common	process_ksm_status		sys_process_ksm_status
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index fff820c3e93e..ab7d77ddc112 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -818,6 +818,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
 asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec,
 			size_t vlen, int behavior, unsigned int flags);
 asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
+asmlinkage long sys_process_ksm_enable(int pidfd, unsigned int flags);
+asmlinkage long sys_process_ksm_disable(int pidfd, unsigned int flags);
+asmlinkage long sys_process_ksm_status(int pidfd, unsigned int flags);
 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 			unsigned long prot, unsigned long pgoff,
 			unsigned long flags);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index d4cc26932ff4..d191548f6326 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -845,8 +845,17 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
 #define __NR_mseal 462
 __SYSCALL(__NR_mseal, sys_mseal)
 
+#define __NR_process_ksm_enable 463
+__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable)
+
+#define __NR_process_ksm_disable 464
+__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable)
+
+#define __NR_process_ksm_status 465
+__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status)
+
 #undef __NR_syscalls
-#define __NR_syscalls 463
+#define __NR_syscalls 466
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys.c b/kernel/sys.c
index 3a2df1bd9f64..86c6dd9d8c84 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2789,6 +2789,153 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	return error;
 }
 
+#ifdef CONFIG_KSM
+enum pkc_action {
+	PKSM_ENABLE = 0,
+	PKSM_DISABLE,
+	PKSM_STATUS,
+};
+
+static long do_process_ksm_control(int pidfd, enum pkc_action action)
+{
+	long ret;
+	struct pid *pid;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	unsigned int f_flags;
+
+	pid = pidfd_get_pid(pidfd, &f_flags);
+	if (IS_ERR(pid)) {
+		ret = PTR_ERR(pid);
+		goto out;
+	}
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task) {
+		ret = -ESRCH;
+		goto put_pid;
+	}
+
+	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
+	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
+	if (IS_ERR_OR_NULL(mm)) {
+		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+		goto release_task;
+	}
+
+	/* Require CAP_SYS_NICE for influencing process performance. */
+	if (!capable(CAP_SYS_NICE)) {
+		ret = -EPERM;
+		goto release_mm;
+	}
+
+	if (mmap_write_lock_killable(mm)) {
+		ret = -EINTR;
+		goto release_mm;
+	}
+
+	switch (action) {
+		case PKSM_ENABLE:
+			ret = ksm_enable_merge_any(mm);
+			break;
+		case PKSM_DISABLE:
+			ret = ksm_disable_merge_any(mm);
+			break;
+		case PKSM_STATUS:
+			ret = !!test_bit(MMF_VM_MERGE_ANY, &mm->flags);
+			break;
+	}
+
+	mmap_write_unlock(mm);
+
+release_mm:
+	mmput(mm);
+release_task:
+	put_task_struct(task);
+put_pid:
+	put_pid(pid);
+out:
+	return ret;
+}
+#endif /* CONFIG_KSM */
+
+SYSCALL_DEFINE2(process_ksm_enable, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_KSM
+	if (flags != 0)
+		return -EINVAL;
+
+	return do_process_ksm_control(pidfd, PKSM_ENABLE);
+#else /* CONFIG_KSM */
+	return -ENOSYS;
+#endif /* CONFIG_KSM */
+}
+
+SYSCALL_DEFINE2(process_ksm_disable, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_KSM
+	if (flags != 0)
+		return -EINVAL;
+
+	return do_process_ksm_control(pidfd, PKSM_DISABLE);
+#else /* CONFIG_KSM */
+	return -ENOSYS;
+#endif /* CONFIG_KSM */
+}
+
+SYSCALL_DEFINE2(process_ksm_status, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_KSM
+	if (flags != 0)
+		return -EINVAL;
+
+	return do_process_ksm_control(pidfd, PKSM_STATUS);
+#else /* CONFIG_KSM */
+	return -ENOSYS;
+#endif /* CONFIG_KSM */
+}
+
+#ifdef CONFIG_KSM
+static ssize_t process_ksm_enable_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", __NR_process_ksm_enable);
+}
+static struct kobj_attribute process_ksm_enable_attr = __ATTR_RO(process_ksm_enable);
+
+static ssize_t process_ksm_disable_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", __NR_process_ksm_disable);
+}
+static struct kobj_attribute process_ksm_disable_attr = __ATTR_RO(process_ksm_disable);
+
+static ssize_t process_ksm_status_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", __NR_process_ksm_status);
+}
+static struct kobj_attribute process_ksm_status_attr = __ATTR_RO(process_ksm_status);
+
+static struct attribute *process_ksm_sysfs_attrs[] = {
+	&process_ksm_enable_attr.attr,
+	&process_ksm_disable_attr.attr,
+	&process_ksm_status_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group process_ksm_sysfs_attr_group = {
+	.attrs = process_ksm_sysfs_attrs,
+	.name = "process_ksm",
+};
+
+static int __init process_ksm_sysfs_init(void)
+{
+	return sysfs_create_group(kernel_kobj, &process_ksm_sysfs_attr_group);
+}
+subsys_initcall(process_ksm_sysfs_init);
+#endif /* CONFIG_KSM */
+
 SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
 		struct getcpu_cache __user *, unused)
 {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index b696b85ac63e..cf7f3d841b1e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -188,6 +188,9 @@ COND_SYSCALL(mincore);
 COND_SYSCALL(madvise);
 COND_SYSCALL(process_madvise);
 COND_SYSCALL(process_mrelease);
+COND_SYSCALL(process_ksm_enable);
+COND_SYSCALL(process_ksm_disable);
+COND_SYSCALL(process_ksm_status);
 COND_SYSCALL(remap_file_pages);
 COND_SYSCALL(mbind);
 COND_SYSCALL(get_mempolicy);
-- 
2.46.0

From 8fb09a4def39f91a1641028f6e03cdf8a35f8dae Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sun, 8 Sep 2024 10:21:31 +0200
Subject: [PATCH 09/12] ntsync

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 Documentation/userspace-api/index.rst         |    1 +
 Documentation/userspace-api/ntsync.rst        |  398 +++++
 MAINTAINERS                                   |    9 +
 drivers/misc/Kconfig                          |    1 -
 drivers/misc/ntsync.c                         |  989 +++++++++++-
 include/uapi/linux/ntsync.h                   |   39 +
 tools/testing/selftests/Makefile              |    1 +
 .../selftests/drivers/ntsync/.gitignore       |    1 +
 .../testing/selftests/drivers/ntsync/Makefile |    7 +
 tools/testing/selftests/drivers/ntsync/config |    1 +
 .../testing/selftests/drivers/ntsync/ntsync.c | 1407 +++++++++++++++++
 11 files changed, 2850 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/userspace-api/ntsync.rst
 create mode 100644 tools/testing/selftests/drivers/ntsync/.gitignore
 create mode 100644 tools/testing/selftests/drivers/ntsync/Makefile
 create mode 100644 tools/testing/selftests/drivers/ntsync/config
 create mode 100644 tools/testing/selftests/drivers/ntsync/ntsync.c

diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst
index 8a251d71fa6e..02bea81fb4bf 100644
--- a/Documentation/userspace-api/index.rst
+++ b/Documentation/userspace-api/index.rst
@@ -64,6 +64,7 @@ Everything else
    vduse
    futex2
    perf_ring_buffer
+   ntsync
 
 .. only::  subproject and html
 
diff --git a/Documentation/userspace-api/ntsync.rst b/Documentation/userspace-api/ntsync.rst
new file mode 100644
index 000000000000..767844637a7d
--- /dev/null
+++ b/Documentation/userspace-api/ntsync.rst
@@ -0,0 +1,398 @@
+===================================
+NT synchronization primitive driver
+===================================
+
+This page documents the user-space API for the ntsync driver.
+
+ntsync is a support driver for emulation of NT synchronization
+primitives by user-space NT emulators. It exists because implementation
+in user-space, using existing tools, cannot match Windows performance
+while offering accurate semantics. It is implemented entirely in
+software, and does not drive any hardware device.
+
+This interface is meant as a compatibility tool only, and should not
+be used for general synchronization. Instead use generic, versatile
+interfaces such as futex(2) and poll(2).
+
+Synchronization primitives
+==========================
+
+The ntsync driver exposes three types of synchronization primitives:
+semaphores, mutexes, and events.
+
+A semaphore holds a single volatile 32-bit counter, and a static 32-bit
+integer denoting the maximum value. It is considered signaled (that is,
+can be acquired without contention, or will wake up a waiting thread)
+when the counter is nonzero. The counter is decremented by one when a
+wait is satisfied. Both the initial and maximum count are established
+when the semaphore is created.
+
+A mutex holds a volatile 32-bit recursion count, and a volatile 32-bit
+identifier denoting its owner. A mutex is considered signaled when its
+owner is zero (indicating that it is not owned). The recursion count is
+incremented when a wait is satisfied, and ownership is set to the given
+identifier.
+
+A mutex also holds an internal flag denoting whether its previous owner
+has died; such a mutex is said to be abandoned. Owner death is not
+tracked automatically based on thread death, but rather must be
+communicated using ``NTSYNC_IOC_MUTEX_KILL``. An abandoned mutex is
+inherently considered unowned.
+
+Except for the "unowned" semantics of zero, the actual value of the
+owner identifier is not interpreted by the ntsync driver at all. The
+intended use is to store a thread identifier; however, the ntsync
+driver does not actually validate that a calling thread provides
+consistent or unique identifiers.
+
+An event is similar to a semaphore with a maximum count of one. It holds
+a volatile boolean state denoting whether it is signaled or not. There
+are two types of events, auto-reset and manual-reset. An auto-reset
+event is designaled when a wait is satisfied; a manual-reset event is
+not. The event type is specified when the event is created.
+
+Unless specified otherwise, all operations on an object are atomic and
+totally ordered with respect to other operations on the same object.
+
+Objects are represented by files. When all file descriptors to an
+object are closed, that object is deleted.
+
+Char device
+===========
+
+The ntsync driver creates a single char device /dev/ntsync. Each file
+description opened on the device represents a unique instance intended
+to back an individual NT virtual machine. Objects created by one ntsync
+instance may only be used with other objects created by the same
+instance.
+
+ioctl reference
+===============
+
+All operations on the device are done through ioctls. There are four
+structures used in ioctl calls::
+
+   struct ntsync_sem_args {
+   	__u32 sem;
+   	__u32 count;
+   	__u32 max;
+   };
+
+   struct ntsync_mutex_args {
+   	__u32 mutex;
+   	__u32 owner;
+   	__u32 count;
+   };
+
+   struct ntsync_event_args {
+   	__u32 event;
+   	__u32 signaled;
+   	__u32 manual;
+   };
+
+   struct ntsync_wait_args {
+   	__u64 timeout;
+   	__u64 objs;
+   	__u32 count;
+   	__u32 owner;
+   	__u32 index;
+   	__u32 alert;
+   	__u32 flags;
+   	__u32 pad;
+   };
+
+Depending on the ioctl, members of the structure may be used as input,
+output, or not at all. All ioctls return 0 on success.
+
+The ioctls on the device file are as follows:
+
+.. c:macro:: NTSYNC_IOC_CREATE_SEM
+
+  Create a semaphore object. Takes a pointer to struct
+  :c:type:`ntsync_sem_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``sem``
+       - On output, contains a file descriptor to the created semaphore.
+     * - ``count``
+       - Initial count of the semaphore.
+     * - ``max``
+       - Maximum count of the semaphore.
+
+  Fails with ``EINVAL`` if ``count`` is greater than ``max``.
+
+.. c:macro:: NTSYNC_IOC_CREATE_MUTEX
+
+  Create a mutex object. Takes a pointer to struct
+  :c:type:`ntsync_mutex_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``mutex``
+       - On output, contains a file descriptor to the created mutex.
+     * - ``count``
+       - Initial recursion count of the mutex.
+     * - ``owner``
+       - Initial owner of the mutex.
+
+  If ``owner`` is nonzero and ``count`` is zero, or if ``owner`` is
+  zero and ``count`` is nonzero, the function fails with ``EINVAL``.
+
+.. c:macro:: NTSYNC_IOC_CREATE_EVENT
+
+  Create an event object. Takes a pointer to struct
+  :c:type:`ntsync_event_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``event``
+       - On output, contains a file descriptor to the created event.
+     * - ``signaled``
+       - If nonzero, the event is initially signaled, otherwise
+         nonsignaled.
+     * - ``manual``
+       - If nonzero, the event is a manual-reset event, otherwise
+         auto-reset.
+
+The ioctls on the individual objects are as follows:
+
+.. c:macro:: NTSYNC_IOC_SEM_POST
+
+  Post to a semaphore object. Takes a pointer to a 32-bit integer,
+  which on input holds the count to be added to the semaphore, and on
+  output contains its previous count.
+
+  If adding to the semaphore's current count would raise the latter
+  past the semaphore's maximum count, the ioctl fails with
+  ``EOVERFLOW`` and the semaphore is not affected. If raising the
+  semaphore's count causes it to become signaled, eligible threads
+  waiting on this semaphore will be woken and the semaphore's count
+  decremented appropriately.
+
+.. c:macro:: NTSYNC_IOC_MUTEX_UNLOCK
+
+  Release a mutex object. Takes a pointer to struct
+  :c:type:`ntsync_mutex_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``mutex``
+       - Ignored.
+     * - ``owner``
+       - Specifies the owner trying to release this mutex.
+     * - ``count``
+       - On output, contains the previous recursion count.
+
+  If ``owner`` is zero, the ioctl fails with ``EINVAL``. If ``owner``
+  is not the current owner of the mutex, the ioctl fails with
+  ``EPERM``.
+
+  The mutex's count will be decremented by one. If decrementing the
+  mutex's count causes it to become zero, the mutex is marked as
+  unowned and signaled, and eligible threads waiting on it will be
+  woken as appropriate.
+
+.. c:macro:: NTSYNC_IOC_SET_EVENT
+
+  Signal an event object. Takes a pointer to a 32-bit integer, which on
+  output contains the previous state of the event.
+
+  Eligible threads will be woken, and auto-reset events will be
+  designaled appropriately.
+
+.. c:macro:: NTSYNC_IOC_RESET_EVENT
+
+  Designal an event object. Takes a pointer to a 32-bit integer, which
+  on output contains the previous state of the event.
+
+.. c:macro:: NTSYNC_IOC_PULSE_EVENT
+
+  Wake threads waiting on an event object while leaving it in an
+  unsignaled state. Takes a pointer to a 32-bit integer, which on
+  output contains the previous state of the event.
+
+  A pulse operation can be thought of as a set followed by a reset,
+  performed as a single atomic operation. If two threads are waiting on
+  an auto-reset event which is pulsed, only one will be woken. If two
+  threads are waiting a manual-reset event which is pulsed, both will
+  be woken. However, in both cases, the event will be unsignaled
+  afterwards, and a simultaneous read operation will always report the
+  event as unsignaled.
+
+.. c:macro:: NTSYNC_IOC_READ_SEM
+
+  Read the current state of a semaphore object. Takes a pointer to
+  struct :c:type:`ntsync_sem_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``sem``
+       - Ignored.
+     * - ``count``
+       - On output, contains the current count of the semaphore.
+     * - ``max``
+       - On output, contains the maximum count of the semaphore.
+
+.. c:macro:: NTSYNC_IOC_READ_MUTEX
+
+  Read the current state of a mutex object. Takes a pointer to struct
+  :c:type:`ntsync_mutex_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``mutex``
+       - Ignored.
+     * - ``owner``
+       - On output, contains the current owner of the mutex, or zero
+         if the mutex is not currently owned.
+     * - ``count``
+       - On output, contains the current recursion count of the mutex.
+
+  If the mutex is marked as abandoned, the function fails with
+  ``EOWNERDEAD``. In this case, ``count`` and ``owner`` are set to
+  zero.
+
+.. c:macro:: NTSYNC_IOC_READ_EVENT
+
+  Read the current state of an event object. Takes a pointer to struct
+  :c:type:`ntsync_event_args`, which is used as follows:
+
+  .. list-table::
+
+     * - ``event``
+       - Ignored.
+     * - ``signaled``
+       - On output, contains the current state of the event.
+     * - ``manual``
+       - On output, contains 1 if the event is a manual-reset event,
+         and 0 otherwise.
+
+.. c:macro:: NTSYNC_IOC_KILL_OWNER
+
+  Mark a mutex as unowned and abandoned if it is owned by the given
+  owner. Takes an input-only pointer to a 32-bit integer denoting the
+  owner. If the owner is zero, the ioctl fails with ``EINVAL``. If the
+  owner does not own the mutex, the function fails with ``EPERM``.
+
+  Eligible threads waiting on the mutex will be woken as appropriate
+  (and such waits will fail with ``EOWNERDEAD``, as described below).
+
+.. c:macro:: NTSYNC_IOC_WAIT_ANY
+
+  Poll on any of a list of objects, atomically acquiring at most one.
+  Takes a pointer to struct :c:type:`ntsync_wait_args`, which is
+  used as follows:
+
+  .. list-table::
+
+     * - ``timeout``
+       - Absolute timeout in nanoseconds. If ``NTSYNC_WAIT_REALTIME``
+         is set, the timeout is measured against the REALTIME clock;
+         otherwise it is measured against the MONOTONIC clock. If the
+         timeout is equal to or earlier than the current time, the
+         function returns immediately without sleeping. If ``timeout``
+         is U64_MAX, the function will sleep until an object is
+         signaled, and will not fail with ``ETIMEDOUT``.
+     * - ``objs``
+       - Pointer to an array of ``count`` file descriptors
+         (specified as an integer so that the structure has the same
+         size regardless of architecture). If any object is
+         invalid, the function fails with ``EINVAL``.
+     * - ``count``
+       - Number of objects specified in the ``objs`` array.
+         If greater than ``NTSYNC_MAX_WAIT_COUNT``, the function fails
+         with ``EINVAL``.
+     * - ``owner``
+       - Mutex owner identifier. If any object in ``objs`` is a mutex,
+         the ioctl will attempt to acquire that mutex on behalf of
+         ``owner``. If ``owner`` is zero, the ioctl fails with
+         ``EINVAL``.
+     * - ``index``
+       - On success, contains the index (into ``objs``) of the object
+         which was signaled. If ``alert`` was signaled instead,
+         this contains ``count``.
+     * - ``alert``
+       - Optional event object file descriptor. If nonzero, this
+         specifies an "alert" event object which, if signaled, will
+         terminate the wait. If nonzero, the identifier must point to a
+         valid event.
+     * - ``flags``
+       - Zero or more flags. Currently the only flag is
+         ``NTSYNC_WAIT_REALTIME``, which causes the timeout to be
+         measured against the REALTIME clock instead of MONOTONIC.
+     * - ``pad``
+       - Unused, must be set to zero.
+
+  This function attempts to acquire one of the given objects. If unable
+  to do so, it sleeps until an object becomes signaled, subsequently
+  acquiring it, or the timeout expires. In the latter case the ioctl
+  fails with ``ETIMEDOUT``. The function only acquires one object, even
+  if multiple objects are signaled.
+
+  A semaphore is considered to be signaled if its count is nonzero, and
+  is acquired by decrementing its count by one. A mutex is considered
+  to be signaled if it is unowned or if its owner matches the ``owner``
+  argument, and is acquired by incrementing its recursion count by one
+  and setting its owner to the ``owner`` argument. An auto-reset event
+  is acquired by designaling it; a manual-reset event is not affected
+  by acquisition.
+
+  Acquisition is atomic and totally ordered with respect to other
+  operations on the same object. If two wait operations (with different
+  ``owner`` identifiers) are queued on the same mutex, only one is
+  signaled. If two wait operations are queued on the same semaphore,
+  and a value of one is posted to it, only one is signaled.
+
+  If an abandoned mutex is acquired, the ioctl fails with
+  ``EOWNERDEAD``. Although this is a failure return, the function may
+  otherwise be considered successful. The mutex is marked as owned by
+  the given owner (with a recursion count of 1) and as no longer
+  abandoned, and ``index`` is still set to the index of the mutex.
+
+  The ``alert`` argument is an "extra" event which can terminate the
+  wait, independently of all other objects.
+
+  It is valid to pass the same object more than once, including by
+  passing the same event in the ``objs`` array and in ``alert``. If a
+  wakeup occurs due to that object being signaled, ``index`` is set to
+  the lowest index corresponding to that object.
+
+  The function may fail with ``EINTR`` if a signal is received.
+
+.. c:macro:: NTSYNC_IOC_WAIT_ALL
+
+  Poll on a list of objects, atomically acquiring all of them. Takes a
+  pointer to struct :c:type:`ntsync_wait_args`, which is used
+  identically to ``NTSYNC_IOC_WAIT_ANY``, except that ``index`` is
+  always filled with zero on success if not woken via alert.
+
+  This function attempts to simultaneously acquire all of the given
+  objects. If unable to do so, it sleeps until all objects become
+  simultaneously signaled, subsequently acquiring them, or the timeout
+  expires. In the latter case the ioctl fails with ``ETIMEDOUT`` and no
+  objects are modified.
+
+  Objects may become signaled and subsequently designaled (through
+  acquisition by other threads) while this thread is sleeping. Only
+  once all objects are simultaneously signaled does the ioctl acquire
+  them and return. The entire acquisition is atomic and totally ordered
+  with respect to other operations on any of the given objects.
+
+  If an abandoned mutex is acquired, the ioctl fails with
+  ``EOWNERDEAD``. Similarly to ``NTSYNC_IOC_WAIT_ANY``, all objects are
+  nevertheless marked as acquired. Note that if multiple mutex objects
+  are specified, there is no way to know which were marked as
+  abandoned.
+
+  As with "any" waits, the ``alert`` argument is an "extra" event which
+  can terminate the wait. Critically, however, an "all" wait will
+  succeed if all members in ``objs`` are signaled, *or* if ``alert`` is
+  signaled. In the latter case ``index`` will be set to ``count``. As
+  with "any" waits, if both conditions are filled, the former takes
+  priority, and objects in ``objs`` will be acquired.
+
+  Unlike ``NTSYNC_IOC_WAIT_ANY``, it is not valid to pass the same
+  object more than once, nor is it valid to pass the same object in
+  ``objs`` and in ``alert``. If this is attempted, the function fails
+  with ``EINVAL``.
diff --git a/MAINTAINERS b/MAINTAINERS
index b27470be2e6a..4112729fc23a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15983,6 +15983,15 @@ T:	git https://github.com/Paragon-Software-Group/linux-ntfs3.git
 F:	Documentation/filesystems/ntfs3.rst
 F:	fs/ntfs3/
 
+NTSYNC SYNCHRONIZATION PRIMITIVE DRIVER
+M:	Elizabeth Figura <zfigura@codeweavers.com>
+L:	wine-devel@winehq.org
+S:	Supported
+F:	Documentation/userspace-api/ntsync.rst
+F:	drivers/misc/ntsync.c
+F:	include/uapi/linux/ntsync.h
+F:	tools/testing/selftests/drivers/ntsync/
+
 NUBUS SUBSYSTEM
 M:	Finn Thain <fthain@linux-m68k.org>
 L:	linux-m68k@lists.linux-m68k.org
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index faf983680040..2907b5c23368 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -507,7 +507,6 @@ config OPEN_DICE
 
 config NTSYNC
 	tristate "NT synchronization primitive emulation"
-	depends on BROKEN
 	help
 	  This module provides kernel support for emulation of Windows NT
 	  synchronization primitives. It is not a hardware driver.
diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c
index 3c2f743c58b0..87a24798a5c7 100644
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -6,11 +6,17 @@
  */
 
 #include <linux/anon_inodes.h>
+#include <linux/atomic.h>
 #include <linux/file.h>
 #include <linux/fs.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
 #include <linux/miscdevice.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/overflow.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <uapi/linux/ntsync.h>
@@ -19,6 +25,8 @@
 
 enum ntsync_type {
 	NTSYNC_TYPE_SEM,
+	NTSYNC_TYPE_MUTEX,
+	NTSYNC_TYPE_EVENT,
 };
 
 /*
@@ -30,10 +38,13 @@ enum ntsync_type {
  *
  * Both rely on struct file for reference counting. Individual
  * ntsync_obj objects take a reference to the device when created.
+ * Wait operations take a reference to each object being waited on for
+ * the duration of the wait.
  */
 
 struct ntsync_obj {
 	spinlock_t lock;
+	int dev_locked;
 
 	enum ntsync_type type;
 
@@ -46,13 +57,335 @@ struct ntsync_obj {
 			__u32 count;
 			__u32 max;
 		} sem;
+		struct {
+			__u32 count;
+			pid_t owner;
+			bool ownerdead;
+		} mutex;
+		struct {
+			bool manual;
+			bool signaled;
+		} event;
 	} u;
+
+	/*
+	 * any_waiters is protected by the object lock, but all_waiters is
+	 * protected by the device wait_all_lock.
+	 */
+	struct list_head any_waiters;
+	struct list_head all_waiters;
+
+	/*
+	 * Hint describing how many tasks are queued on this object in a
+	 * wait-all operation.
+	 *
+	 * Any time we do a wake, we may need to wake "all" waiters as well as
+	 * "any" waiters. In order to atomically wake "all" waiters, we must
+	 * lock all of the objects, and that means grabbing the wait_all_lock
+	 * below (and, due to lock ordering rules, before locking this object).
+	 * However, wait-all is a rare operation, and grabbing the wait-all
+	 * lock for every wake would create unnecessary contention.
+	 * Therefore we first check whether all_hint is zero, and, if it is,
+	 * we skip trying to wake "all" waiters.
+	 *
+	 * Since wait requests must originate from user-space threads, we're
+	 * limited here by PID_MAX_LIMIT, so there's no risk of overflow.
+	 */
+	atomic_t all_hint;
+};
+
+struct ntsync_q_entry {
+	struct list_head node;
+	struct ntsync_q *q;
+	struct ntsync_obj *obj;
+	__u32 index;
+};
+
+struct ntsync_q {
+	struct task_struct *task;
+	__u32 owner;
+
+	/*
+	 * Protected via atomic_try_cmpxchg(). Only the thread that wins the
+	 * compare-and-swap may actually change object states and wake this
+	 * task.
+	 */
+	atomic_t signaled;
+
+	bool all;
+	bool ownerdead;
+	__u32 count;
+	struct ntsync_q_entry entries[];
 };
 
 struct ntsync_device {
+	/*
+	 * Wait-all operations must atomically grab all objects, and be totally
+	 * ordered with respect to each other and wait-any operations.
+	 * If one thread is trying to acquire several objects, another thread
+	 * cannot touch the object at the same time.
+	 *
+	 * This device-wide lock is used to serialize wait-for-all
+	 * operations, and operations on an object that is involved in a
+	 * wait-for-all.
+	 */
+	struct mutex wait_all_lock;
+
 	struct file *file;
 };
 
+/*
+ * Single objects are locked using obj->lock.
+ *
+ * Multiple objects are 'locked' while holding dev->wait_all_lock.
+ * In this case however, individual objects are not locked by holding
+ * obj->lock, but by setting obj->dev_locked.
+ *
+ * This means that in order to lock a single object, the sequence is slightly
+ * more complicated than usual. Specifically it needs to check obj->dev_locked
+ * after acquiring obj->lock, if set, it needs to drop the lock and acquire
+ * dev->wait_all_lock in order to serialize against the multi-object operation.
+ */
+
+static void dev_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj)
+{
+	lockdep_assert_held(&dev->wait_all_lock);
+	lockdep_assert(obj->dev == dev);
+	spin_lock(&obj->lock);
+	/*
+	 * By setting obj->dev_locked inside obj->lock, it is ensured that
+	 * anyone holding obj->lock must see the value.
+	 */
+	obj->dev_locked = 1;
+	spin_unlock(&obj->lock);
+}
+
+static void dev_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj)
+{
+	lockdep_assert_held(&dev->wait_all_lock);
+	lockdep_assert(obj->dev == dev);
+	spin_lock(&obj->lock);
+	obj->dev_locked = 0;
+	spin_unlock(&obj->lock);
+}
+
+static void obj_lock(struct ntsync_obj *obj)
+{
+	struct ntsync_device *dev = obj->dev;
+
+	for (;;) {
+		spin_lock(&obj->lock);
+		if (likely(!obj->dev_locked))
+			break;
+
+		spin_unlock(&obj->lock);
+		mutex_lock(&dev->wait_all_lock);
+		spin_lock(&obj->lock);
+		/*
+		 * obj->dev_locked should be set and released under the same
+		 * wait_all_lock section, since we now own this lock, it should
+		 * be clear.
+		 */
+		lockdep_assert(!obj->dev_locked);
+		spin_unlock(&obj->lock);
+		mutex_unlock(&dev->wait_all_lock);
+	}
+}
+
+static void obj_unlock(struct ntsync_obj *obj)
+{
+	spin_unlock(&obj->lock);
+}
+
+static bool ntsync_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj)
+{
+	bool all;
+
+	obj_lock(obj);
+	all = atomic_read(&obj->all_hint);
+	if (unlikely(all)) {
+		obj_unlock(obj);
+		mutex_lock(&dev->wait_all_lock);
+		dev_lock_obj(dev, obj);
+	}
+
+	return all;
+}
+
+static void ntsync_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj, bool all)
+{
+	if (all) {
+		dev_unlock_obj(dev, obj);
+		mutex_unlock(&dev->wait_all_lock);
+	} else {
+		obj_unlock(obj);
+	}
+}
+
+#define ntsync_assert_held(obj) \
+	lockdep_assert((lockdep_is_held(&(obj)->lock) != LOCK_STATE_NOT_HELD) || \
+		       ((lockdep_is_held(&(obj)->dev->wait_all_lock) != LOCK_STATE_NOT_HELD) && \
+			(obj)->dev_locked))
+
+static bool is_signaled(struct ntsync_obj *obj, __u32 owner)
+{
+	ntsync_assert_held(obj);
+
+	switch (obj->type) {
+	case NTSYNC_TYPE_SEM:
+		return !!obj->u.sem.count;
+	case NTSYNC_TYPE_MUTEX:
+		if (obj->u.mutex.owner && obj->u.mutex.owner != owner)
+			return false;
+		return obj->u.mutex.count < UINT_MAX;
+	case NTSYNC_TYPE_EVENT:
+		return obj->u.event.signaled;
+	}
+
+	WARN(1, "bad object type %#x\n", obj->type);
+	return false;
+}
+
+/*
+ * "locked_obj" is an optional pointer to an object which is already locked and
+ * should not be locked again. This is necessary so that changing an object's
+ * state and waking it can be a single atomic operation.
+ */
+static void try_wake_all(struct ntsync_device *dev, struct ntsync_q *q,
+			 struct ntsync_obj *locked_obj)
+{
+	__u32 count = q->count;
+	bool can_wake = true;
+	int signaled = -1;
+	__u32 i;
+
+	lockdep_assert_held(&dev->wait_all_lock);
+	if (locked_obj)
+		lockdep_assert(locked_obj->dev_locked);
+
+	for (i = 0; i < count; i++) {
+		if (q->entries[i].obj != locked_obj)
+			dev_lock_obj(dev, q->entries[i].obj);
+	}
+
+	for (i = 0; i < count; i++) {
+		if (!is_signaled(q->entries[i].obj, q->owner)) {
+			can_wake = false;
+			break;
+		}
+	}
+
+	if (can_wake && atomic_try_cmpxchg(&q->signaled, &signaled, 0)) {
+		for (i = 0; i < count; i++) {
+			struct ntsync_obj *obj = q->entries[i].obj;
+
+			switch (obj->type) {
+			case NTSYNC_TYPE_SEM:
+				obj->u.sem.count--;
+				break;
+			case NTSYNC_TYPE_MUTEX:
+				if (obj->u.mutex.ownerdead)
+					q->ownerdead = true;
+				obj->u.mutex.ownerdead = false;
+				obj->u.mutex.count++;
+				obj->u.mutex.owner = q->owner;
+				break;
+			case NTSYNC_TYPE_EVENT:
+				if (!obj->u.event.manual)
+					obj->u.event.signaled = false;
+				break;
+			}
+		}
+		wake_up_process(q->task);
+	}
+
+	for (i = 0; i < count; i++) {
+		if (q->entries[i].obj != locked_obj)
+			dev_unlock_obj(dev, q->entries[i].obj);
+	}
+}
+
+static void try_wake_all_obj(struct ntsync_device *dev, struct ntsync_obj *obj)
+{
+	struct ntsync_q_entry *entry;
+
+	lockdep_assert_held(&dev->wait_all_lock);
+	lockdep_assert(obj->dev_locked);
+
+	list_for_each_entry(entry, &obj->all_waiters, node)
+		try_wake_all(dev, entry->q, obj);
+}
+
+static void try_wake_any_sem(struct ntsync_obj *sem)
+{
+	struct ntsync_q_entry *entry;
+
+	ntsync_assert_held(sem);
+	lockdep_assert(sem->type == NTSYNC_TYPE_SEM);
+
+	list_for_each_entry(entry, &sem->any_waiters, node) {
+		struct ntsync_q *q = entry->q;
+		int signaled = -1;
+
+		if (!sem->u.sem.count)
+			break;
+
+		if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) {
+			sem->u.sem.count--;
+			wake_up_process(q->task);
+		}
+	}
+}
+
+static void try_wake_any_mutex(struct ntsync_obj *mutex)
+{
+	struct ntsync_q_entry *entry;
+
+	ntsync_assert_held(mutex);
+	lockdep_assert(mutex->type == NTSYNC_TYPE_MUTEX);
+
+	list_for_each_entry(entry, &mutex->any_waiters, node) {
+		struct ntsync_q *q = entry->q;
+		int signaled = -1;
+
+		if (mutex->u.mutex.count == UINT_MAX)
+			break;
+		if (mutex->u.mutex.owner && mutex->u.mutex.owner != q->owner)
+			continue;
+
+		if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) {
+			if (mutex->u.mutex.ownerdead)
+				q->ownerdead = true;
+			mutex->u.mutex.ownerdead = false;
+			mutex->u.mutex.count++;
+			mutex->u.mutex.owner = q->owner;
+			wake_up_process(q->task);
+		}
+	}
+}
+
+static void try_wake_any_event(struct ntsync_obj *event)
+{
+	struct ntsync_q_entry *entry;
+
+	ntsync_assert_held(event);
+	lockdep_assert(event->type == NTSYNC_TYPE_EVENT);
+
+	list_for_each_entry(entry, &event->any_waiters, node) {
+		struct ntsync_q *q = entry->q;
+		int signaled = -1;
+
+		if (!event->u.event.signaled)
+			break;
+
+		if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) {
+			if (!event->u.event.manual)
+				event->u.event.signaled = false;
+			wake_up_process(q->task);
+		}
+	}
+}
+
 /*
  * Actually change the semaphore state, returning -EOVERFLOW if it is made
  * invalid.
@@ -61,7 +394,7 @@ static int post_sem_state(struct ntsync_obj *sem, __u32 count)
 {
 	__u32 sum;
 
-	lockdep_assert_held(&sem->lock);
+	ntsync_assert_held(sem);
 
 	if (check_add_overflow(sem->u.sem.count, count, &sum) ||
 	    sum > sem->u.sem.max)
@@ -73,9 +406,11 @@ static int post_sem_state(struct ntsync_obj *sem, __u32 count)
 
 static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp)
 {
+	struct ntsync_device *dev = sem->dev;
 	__u32 __user *user_args = argp;
 	__u32 prev_count;
 	__u32 args;
+	bool all;
 	int ret;
 
 	if (copy_from_user(&args, argp, sizeof(args)))
@@ -84,12 +419,17 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp)
 	if (sem->type != NTSYNC_TYPE_SEM)
 		return -EINVAL;
 
-	spin_lock(&sem->lock);
+	all = ntsync_lock_obj(dev, sem);
 
 	prev_count = sem->u.sem.count;
 	ret = post_sem_state(sem, args);
+	if (!ret) {
+		if (all)
+			try_wake_all_obj(dev, sem);
+		try_wake_any_sem(sem);
+	}
 
-	spin_unlock(&sem->lock);
+	ntsync_unlock_obj(dev, sem, all);
 
 	if (!ret && put_user(prev_count, user_args))
 		ret = -EFAULT;
@@ -97,6 +437,226 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp)
 	return ret;
 }
 
+/*
+ * Actually change the mutex state, returning -EPERM if not the owner.
+ */
+static int unlock_mutex_state(struct ntsync_obj *mutex,
+			      const struct ntsync_mutex_args *args)
+{
+	ntsync_assert_held(mutex);
+
+	if (mutex->u.mutex.owner != args->owner)
+		return -EPERM;
+
+	if (!--mutex->u.mutex.count)
+		mutex->u.mutex.owner = 0;
+	return 0;
+}
+
+static int ntsync_mutex_unlock(struct ntsync_obj *mutex, void __user *argp)
+{
+	struct ntsync_mutex_args __user *user_args = argp;
+	struct ntsync_device *dev = mutex->dev;
+	struct ntsync_mutex_args args;
+	__u32 prev_count;
+	bool all;
+	int ret;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+	if (!args.owner)
+		return -EINVAL;
+
+	if (mutex->type != NTSYNC_TYPE_MUTEX)
+		return -EINVAL;
+
+	all = ntsync_lock_obj(dev, mutex);
+
+	prev_count = mutex->u.mutex.count;
+	ret = unlock_mutex_state(mutex, &args);
+	if (!ret) {
+		if (all)
+			try_wake_all_obj(dev, mutex);
+		try_wake_any_mutex(mutex);
+	}
+
+	ntsync_unlock_obj(dev, mutex, all);
+
+	if (!ret && put_user(prev_count, &user_args->count))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+/*
+ * Actually change the mutex state to mark its owner as dead,
+ * returning -EPERM if not the owner.
+ */
+static int kill_mutex_state(struct ntsync_obj *mutex, __u32 owner)
+{
+	ntsync_assert_held(mutex);
+
+	if (mutex->u.mutex.owner != owner)
+		return -EPERM;
+
+	mutex->u.mutex.ownerdead = true;
+	mutex->u.mutex.owner = 0;
+	mutex->u.mutex.count = 0;
+	return 0;
+}
+
+static int ntsync_mutex_kill(struct ntsync_obj *mutex, void __user *argp)
+{
+	struct ntsync_device *dev = mutex->dev;
+	__u32 owner;
+	bool all;
+	int ret;
+
+	if (get_user(owner, (__u32 __user *)argp))
+		return -EFAULT;
+	if (!owner)
+		return -EINVAL;
+
+	if (mutex->type != NTSYNC_TYPE_MUTEX)
+		return -EINVAL;
+
+	all = ntsync_lock_obj(dev, mutex);
+
+	ret = kill_mutex_state(mutex, owner);
+	if (!ret) {
+		if (all)
+			try_wake_all_obj(dev, mutex);
+		try_wake_any_mutex(mutex);
+	}
+
+	ntsync_unlock_obj(dev, mutex, all);
+
+	return ret;
+}
+
+static int ntsync_event_set(struct ntsync_obj *event, void __user *argp, bool pulse)
+{
+	struct ntsync_device *dev = event->dev;
+	__u32 prev_state;
+	bool all;
+
+	if (event->type != NTSYNC_TYPE_EVENT)
+		return -EINVAL;
+
+	all = ntsync_lock_obj(dev, event);
+
+	prev_state = event->u.event.signaled;
+	event->u.event.signaled = true;
+	if (all)
+		try_wake_all_obj(dev, event);
+	try_wake_any_event(event);
+	if (pulse)
+		event->u.event.signaled = false;
+
+	ntsync_unlock_obj(dev, event, all);
+
+	if (put_user(prev_state, (__u32 __user *)argp))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ntsync_event_reset(struct ntsync_obj *event, void __user *argp)
+{
+	struct ntsync_device *dev = event->dev;
+	__u32 prev_state;
+	bool all;
+
+	if (event->type != NTSYNC_TYPE_EVENT)
+		return -EINVAL;
+
+	all = ntsync_lock_obj(dev, event);
+
+	prev_state = event->u.event.signaled;
+	event->u.event.signaled = false;
+
+	ntsync_unlock_obj(dev, event, all);
+
+	if (put_user(prev_state, (__u32 __user *)argp))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ntsync_sem_read(struct ntsync_obj *sem, void __user *argp)
+{
+	struct ntsync_sem_args __user *user_args = argp;
+	struct ntsync_device *dev = sem->dev;
+	struct ntsync_sem_args args;
+	bool all;
+
+	if (sem->type != NTSYNC_TYPE_SEM)
+		return -EINVAL;
+
+	args.sem = 0;
+
+	all = ntsync_lock_obj(dev, sem);
+
+	args.count = sem->u.sem.count;
+	args.max = sem->u.sem.max;
+
+	ntsync_unlock_obj(dev, sem, all);
+
+	if (copy_to_user(user_args, &args, sizeof(args)))
+		return -EFAULT;
+	return 0;
+}
+
+static int ntsync_mutex_read(struct ntsync_obj *mutex, void __user *argp)
+{
+	struct ntsync_mutex_args __user *user_args = argp;
+	struct ntsync_device *dev = mutex->dev;
+	struct ntsync_mutex_args args;
+	bool all;
+	int ret;
+
+	if (mutex->type != NTSYNC_TYPE_MUTEX)
+		return -EINVAL;
+
+	args.mutex = 0;
+
+	all = ntsync_lock_obj(dev, mutex);
+
+	args.count = mutex->u.mutex.count;
+	args.owner = mutex->u.mutex.owner;
+	ret = mutex->u.mutex.ownerdead ? -EOWNERDEAD : 0;
+
+	ntsync_unlock_obj(dev, mutex, all);
+
+	if (copy_to_user(user_args, &args, sizeof(args)))
+		return -EFAULT;
+	return ret;
+}
+
+static int ntsync_event_read(struct ntsync_obj *event, void __user *argp)
+{
+	struct ntsync_event_args __user *user_args = argp;
+	struct ntsync_device *dev = event->dev;
+	struct ntsync_event_args args;
+	bool all;
+
+	if (event->type != NTSYNC_TYPE_EVENT)
+		return -EINVAL;
+
+	args.event = 0;
+
+	all = ntsync_lock_obj(dev, event);
+
+	args.manual = event->u.event.manual;
+	args.signaled = event->u.event.signaled;
+
+	ntsync_unlock_obj(dev, event, all);
+
+	if (copy_to_user(user_args, &args, sizeof(args)))
+		return -EFAULT;
+	return 0;
+}
+
 static int ntsync_obj_release(struct inode *inode, struct file *file)
 {
 	struct ntsync_obj *obj = file->private_data;
@@ -116,6 +676,22 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd,
 	switch (cmd) {
 	case NTSYNC_IOC_SEM_POST:
 		return ntsync_sem_post(obj, argp);
+	case NTSYNC_IOC_SEM_READ:
+		return ntsync_sem_read(obj, argp);
+	case NTSYNC_IOC_MUTEX_UNLOCK:
+		return ntsync_mutex_unlock(obj, argp);
+	case NTSYNC_IOC_MUTEX_KILL:
+		return ntsync_mutex_kill(obj, argp);
+	case NTSYNC_IOC_MUTEX_READ:
+		return ntsync_mutex_read(obj, argp);
+	case NTSYNC_IOC_EVENT_SET:
+		return ntsync_event_set(obj, argp, false);
+	case NTSYNC_IOC_EVENT_RESET:
+		return ntsync_event_reset(obj, argp);
+	case NTSYNC_IOC_EVENT_PULSE:
+		return ntsync_event_set(obj, argp, true);
+	case NTSYNC_IOC_EVENT_READ:
+		return ntsync_event_read(obj, argp);
 	default:
 		return -ENOIOCTLCMD;
 	}
@@ -141,6 +717,9 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev,
 	obj->dev = dev;
 	get_file(dev->file);
 	spin_lock_init(&obj->lock);
+	INIT_LIST_HEAD(&obj->any_waiters);
+	INIT_LIST_HEAD(&obj->all_waiters);
+	atomic_set(&obj->all_hint, 0);
 
 	return obj;
 }
@@ -191,6 +770,400 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp)
 	return put_user(fd, &user_args->sem);
 }
 
+static int ntsync_create_mutex(struct ntsync_device *dev, void __user *argp)
+{
+	struct ntsync_mutex_args __user *user_args = argp;
+	struct ntsync_mutex_args args;
+	struct ntsync_obj *mutex;
+	int fd;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	if (!args.owner != !args.count)
+		return -EINVAL;
+
+	mutex = ntsync_alloc_obj(dev, NTSYNC_TYPE_MUTEX);
+	if (!mutex)
+		return -ENOMEM;
+	mutex->u.mutex.count = args.count;
+	mutex->u.mutex.owner = args.owner;
+	fd = ntsync_obj_get_fd(mutex);
+	if (fd < 0) {
+		kfree(mutex);
+		return fd;
+	}
+
+	return put_user(fd, &user_args->mutex);
+}
+
+static int ntsync_create_event(struct ntsync_device *dev, void __user *argp)
+{
+	struct ntsync_event_args __user *user_args = argp;
+	struct ntsync_event_args args;
+	struct ntsync_obj *event;
+	int fd;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	event = ntsync_alloc_obj(dev, NTSYNC_TYPE_EVENT);
+	if (!event)
+		return -ENOMEM;
+	event->u.event.manual = args.manual;
+	event->u.event.signaled = args.signaled;
+	fd = ntsync_obj_get_fd(event);
+	if (fd < 0) {
+		kfree(event);
+		return fd;
+	}
+
+	return put_user(fd, &user_args->event);
+}
+
+static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd)
+{
+	struct file *file = fget(fd);
+	struct ntsync_obj *obj;
+
+	if (!file)
+		return NULL;
+
+	if (file->f_op != &ntsync_obj_fops) {
+		fput(file);
+		return NULL;
+	}
+
+	obj = file->private_data;
+	if (obj->dev != dev) {
+		fput(file);
+		return NULL;
+	}
+
+	return obj;
+}
+
+static void put_obj(struct ntsync_obj *obj)
+{
+	fput(obj->file);
+}
+
+static int ntsync_schedule(const struct ntsync_q *q, const struct ntsync_wait_args *args)
+{
+	ktime_t timeout = ns_to_ktime(args->timeout);
+	clockid_t clock = CLOCK_MONOTONIC;
+	ktime_t *timeout_ptr;
+	int ret = 0;
+
+	timeout_ptr = (args->timeout == U64_MAX ? NULL : &timeout);
+
+	if (args->flags & NTSYNC_WAIT_REALTIME)
+		clock = CLOCK_REALTIME;
+
+	do {
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (atomic_read(&q->signaled) != -1) {
+			ret = 0;
+			break;
+		}
+		ret = schedule_hrtimeout_range_clock(timeout_ptr, 0, HRTIMER_MODE_ABS, clock);
+	} while (ret < 0);
+	__set_current_state(TASK_RUNNING);
+
+	return ret;
+}
+
+/*
+ * Allocate and initialize the ntsync_q structure, but do not queue us yet.
+ */
+static int setup_wait(struct ntsync_device *dev,
+		      const struct ntsync_wait_args *args, bool all,
+		      struct ntsync_q **ret_q)
+{
+	int fds[NTSYNC_MAX_WAIT_COUNT + 1];
+	const __u32 count = args->count;
+	struct ntsync_q *q;
+	__u32 total_count;
+	__u32 i, j;
+
+	if (args->pad || (args->flags & ~NTSYNC_WAIT_REALTIME))
+		return -EINVAL;
+
+	if (args->count > NTSYNC_MAX_WAIT_COUNT)
+		return -EINVAL;
+
+	total_count = count;
+	if (args->alert)
+		total_count++;
+
+	if (copy_from_user(fds, u64_to_user_ptr(args->objs),
+			   array_size(count, sizeof(*fds))))
+		return -EFAULT;
+	if (args->alert)
+		fds[count] = args->alert;
+
+	q = kmalloc(struct_size(q, entries, total_count), GFP_KERNEL);
+	if (!q)
+		return -ENOMEM;
+	q->task = current;
+	q->owner = args->owner;
+	atomic_set(&q->signaled, -1);
+	q->all = all;
+	q->ownerdead = false;
+	q->count = count;
+
+	for (i = 0; i < total_count; i++) {
+		struct ntsync_q_entry *entry = &q->entries[i];
+		struct ntsync_obj *obj = get_obj(dev, fds[i]);
+
+		if (!obj)
+			goto err;
+
+		if (all) {
+			/* Check that the objects are all distinct. */
+			for (j = 0; j < i; j++) {
+				if (obj == q->entries[j].obj) {
+					put_obj(obj);
+					goto err;
+				}
+			}
+		}
+
+		entry->obj = obj;
+		entry->q = q;
+		entry->index = i;
+	}
+
+	*ret_q = q;
+	return 0;
+
+err:
+	for (j = 0; j < i; j++)
+		put_obj(q->entries[j].obj);
+	kfree(q);
+	return -EINVAL;
+}
+
+static void try_wake_any_obj(struct ntsync_obj *obj)
+{
+	switch (obj->type) {
+	case NTSYNC_TYPE_SEM:
+		try_wake_any_sem(obj);
+		break;
+	case NTSYNC_TYPE_MUTEX:
+		try_wake_any_mutex(obj);
+		break;
+	case NTSYNC_TYPE_EVENT:
+		try_wake_any_event(obj);
+		break;
+	}
+}
+
+static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp)
+{
+	struct ntsync_wait_args args;
+	__u32 i, total_count;
+	struct ntsync_q *q;
+	int signaled;
+	bool all;
+	int ret;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	ret = setup_wait(dev, &args, false, &q);
+	if (ret < 0)
+		return ret;
+
+	total_count = args.count;
+	if (args.alert)
+		total_count++;
+
+	/* queue ourselves */
+
+	for (i = 0; i < total_count; i++) {
+		struct ntsync_q_entry *entry = &q->entries[i];
+		struct ntsync_obj *obj = entry->obj;
+
+		all = ntsync_lock_obj(dev, obj);
+		list_add_tail(&entry->node, &obj->any_waiters);
+		ntsync_unlock_obj(dev, obj, all);
+	}
+
+	/*
+	 * Check if we are already signaled.
+	 *
+	 * Note that the API requires that normal objects are checked before
+	 * the alert event. Hence we queue the alert event last, and check
+	 * objects in order.
+	 */
+
+	for (i = 0; i < total_count; i++) {
+		struct ntsync_obj *obj = q->entries[i].obj;
+
+		if (atomic_read(&q->signaled) != -1)
+			break;
+
+		all = ntsync_lock_obj(dev, obj);
+		try_wake_any_obj(obj);
+		ntsync_unlock_obj(dev, obj, all);
+	}
+
+	/* sleep */
+
+	ret = ntsync_schedule(q, &args);
+
+	/* and finally, unqueue */
+
+	for (i = 0; i < total_count; i++) {
+		struct ntsync_q_entry *entry = &q->entries[i];
+		struct ntsync_obj *obj = entry->obj;
+
+		all = ntsync_lock_obj(dev, obj);
+		list_del(&entry->node);
+		ntsync_unlock_obj(dev, obj, all);
+
+		put_obj(obj);
+	}
+
+	signaled = atomic_read(&q->signaled);
+	if (signaled != -1) {
+		struct ntsync_wait_args __user *user_args = argp;
+
+		/* even if we caught a signal, we need to communicate success */
+		ret = q->ownerdead ? -EOWNERDEAD : 0;
+
+		if (put_user(signaled, &user_args->index))
+			ret = -EFAULT;
+	} else if (!ret) {
+		ret = -ETIMEDOUT;
+	}
+
+	kfree(q);
+	return ret;
+}
+
+static int ntsync_wait_all(struct ntsync_device *dev, void __user *argp)
+{
+	struct ntsync_wait_args args;
+	struct ntsync_q *q;
+	int signaled;
+	__u32 i;
+	int ret;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	ret = setup_wait(dev, &args, true, &q);
+	if (ret < 0)
+		return ret;
+
+	/* queue ourselves */
+
+	mutex_lock(&dev->wait_all_lock);
+
+	for (i = 0; i < args.count; i++) {
+		struct ntsync_q_entry *entry = &q->entries[i];
+		struct ntsync_obj *obj = entry->obj;
+
+		atomic_inc(&obj->all_hint);
+
+		/*
+		 * obj->all_waiters is protected by dev->wait_all_lock rather
+		 * than obj->lock, so there is no need to acquire obj->lock
+		 * here.
+		 */
+		list_add_tail(&entry->node, &obj->all_waiters);
+	}
+	if (args.alert) {
+		struct ntsync_q_entry *entry = &q->entries[args.count];
+		struct ntsync_obj *obj = entry->obj;
+
+		dev_lock_obj(dev, obj);
+		list_add_tail(&entry->node, &obj->any_waiters);
+		dev_unlock_obj(dev, obj);
+	}
+
+	/* check if we are already signaled */
+
+	try_wake_all(dev, q, NULL);
+
+	mutex_unlock(&dev->wait_all_lock);
+
+	/*
+	 * Check if the alert event is signaled, making sure to do so only
+	 * after checking if the other objects are signaled.
+	 */
+
+	if (args.alert) {
+		struct ntsync_obj *obj = q->entries[args.count].obj;
+
+		if (atomic_read(&q->signaled) == -1) {
+			bool all = ntsync_lock_obj(dev, obj);
+			try_wake_any_obj(obj);
+			ntsync_unlock_obj(dev, obj, all);
+		}
+	}
+
+	/* sleep */
+
+	ret = ntsync_schedule(q, &args);
+
+	/* and finally, unqueue */
+
+	mutex_lock(&dev->wait_all_lock);
+
+	for (i = 0; i < args.count; i++) {
+		struct ntsync_q_entry *entry = &q->entries[i];
+		struct ntsync_obj *obj = entry->obj;
+
+		/*
+		 * obj->all_waiters is protected by dev->wait_all_lock rather
+		 * than obj->lock, so there is no need to acquire it here.
+		 */
+		list_del(&entry->node);
+
+		atomic_dec(&obj->all_hint);
+
+		put_obj(obj);
+	}
+
+	mutex_unlock(&dev->wait_all_lock);
+
+	if (args.alert) {
+		struct ntsync_q_entry *entry = &q->entries[args.count];
+		struct ntsync_obj *obj = entry->obj;
+		bool all;
+
+		all = ntsync_lock_obj(dev, obj);
+		list_del(&entry->node);
+		ntsync_unlock_obj(dev, obj, all);
+
+		put_obj(obj);
+	}
+
+	signaled = atomic_read(&q->signaled);
+	if (signaled != -1) {
+		struct ntsync_wait_args __user *user_args = argp;
+
+		/* even if we caught a signal, we need to communicate success */
+		ret = q->ownerdead ? -EOWNERDEAD : 0;
+
+		if (put_user(signaled, &user_args->index))
+			ret = -EFAULT;
+	} else if (!ret) {
+		ret = -ETIMEDOUT;
+	}
+
+	kfree(q);
+	return ret;
+}
+
 static int ntsync_char_open(struct inode *inode, struct file *file)
 {
 	struct ntsync_device *dev;
@@ -199,6 +1172,8 @@ static int ntsync_char_open(struct inode *inode, struct file *file)
 	if (!dev)
 		return -ENOMEM;
 
+	mutex_init(&dev->wait_all_lock);
+
 	file->private_data = dev;
 	dev->file = file;
 	return nonseekable_open(inode, file);
@@ -220,8 +1195,16 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd,
 	void __user *argp = (void __user *)parm;
 
 	switch (cmd) {
+	case NTSYNC_IOC_CREATE_EVENT:
+		return ntsync_create_event(dev, argp);
+	case NTSYNC_IOC_CREATE_MUTEX:
+		return ntsync_create_mutex(dev, argp);
 	case NTSYNC_IOC_CREATE_SEM:
 		return ntsync_create_sem(dev, argp);
+	case NTSYNC_IOC_WAIT_ALL:
+		return ntsync_wait_all(dev, argp);
+	case NTSYNC_IOC_WAIT_ANY:
+		return ntsync_wait_any(dev, argp);
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h
index dcfa38fdc93c..4a8095a3fc34 100644
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -16,8 +16,47 @@ struct ntsync_sem_args {
 	__u32 max;
 };
 
+struct ntsync_mutex_args {
+	__u32 mutex;
+	__u32 owner;
+	__u32 count;
+};
+
+struct ntsync_event_args {
+	__u32 event;
+	__u32 manual;
+	__u32 signaled;
+};
+
+#define NTSYNC_WAIT_REALTIME	0x1
+
+struct ntsync_wait_args {
+	__u64 timeout;
+	__u64 objs;
+	__u32 count;
+	__u32 index;
+	__u32 flags;
+	__u32 owner;
+	__u32 alert;
+	__u32 pad;
+};
+
+#define NTSYNC_MAX_WAIT_COUNT 64
+
 #define NTSYNC_IOC_CREATE_SEM		_IOWR('N', 0x80, struct ntsync_sem_args)
+#define NTSYNC_IOC_WAIT_ANY		_IOWR('N', 0x82, struct ntsync_wait_args)
+#define NTSYNC_IOC_WAIT_ALL		_IOWR('N', 0x83, struct ntsync_wait_args)
+#define NTSYNC_IOC_CREATE_MUTEX		_IOWR('N', 0x84, struct ntsync_sem_args)
+#define NTSYNC_IOC_CREATE_EVENT		_IOWR('N', 0x87, struct ntsync_event_args)
 
 #define NTSYNC_IOC_SEM_POST		_IOWR('N', 0x81, __u32)
+#define NTSYNC_IOC_MUTEX_UNLOCK		_IOWR('N', 0x85, struct ntsync_mutex_args)
+#define NTSYNC_IOC_MUTEX_KILL		_IOW ('N', 0x86, __u32)
+#define NTSYNC_IOC_EVENT_SET		_IOR ('N', 0x88, __u32)
+#define NTSYNC_IOC_EVENT_RESET		_IOR ('N', 0x89, __u32)
+#define NTSYNC_IOC_EVENT_PULSE		_IOR ('N', 0x8a, __u32)
+#define NTSYNC_IOC_SEM_READ		_IOR ('N', 0x8b, struct ntsync_sem_args)
+#define NTSYNC_IOC_MUTEX_READ		_IOR ('N', 0x8c, struct ntsync_mutex_args)
+#define NTSYNC_IOC_EVENT_READ		_IOR ('N', 0x8d, struct ntsync_event_args)
 
 #endif
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 9039f3709aff..d5aeaa8fe3ca 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -16,6 +16,7 @@ TARGETS += damon
 TARGETS += devices
 TARGETS += dmabuf-heaps
 TARGETS += drivers/dma-buf
+TARGETS += drivers/ntsync
 TARGETS += drivers/s390x/uvdevice
 TARGETS += drivers/net
 TARGETS += drivers/net/bonding
diff --git a/tools/testing/selftests/drivers/ntsync/.gitignore b/tools/testing/selftests/drivers/ntsync/.gitignore
new file mode 100644
index 000000000000..848573a3d3ea
--- /dev/null
+++ b/tools/testing/selftests/drivers/ntsync/.gitignore
@@ -0,0 +1 @@
+ntsync
diff --git a/tools/testing/selftests/drivers/ntsync/Makefile b/tools/testing/selftests/drivers/ntsync/Makefile
new file mode 100644
index 000000000000..dbf2b055c0b2
--- /dev/null
+++ b/tools/testing/selftests/drivers/ntsync/Makefile
@@ -0,0 +1,7 @@
+# SPDX-LICENSE-IDENTIFIER: GPL-2.0-only
+TEST_GEN_PROGS := ntsync
+
+CFLAGS += $(KHDR_INCLUDES)
+LDLIBS += -lpthread
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/drivers/ntsync/config b/tools/testing/selftests/drivers/ntsync/config
new file mode 100644
index 000000000000..60539c826d06
--- /dev/null
+++ b/tools/testing/selftests/drivers/ntsync/config
@@ -0,0 +1 @@
+CONFIG_WINESYNC=y
diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c
new file mode 100644
index 000000000000..5fa2c9a0768c
--- /dev/null
+++ b/tools/testing/selftests/drivers/ntsync/ntsync.c
@@ -0,0 +1,1407 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Various unit tests for the "ntsync" synchronization primitive driver.
+ *
+ * Copyright (C) 2021-2022 Elizabeth Figura <zfigura@codeweavers.com>
+ */
+
+#define _GNU_SOURCE
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <pthread.h>
+#include <linux/ntsync.h>
+#include "../../kselftest_harness.h"
+
+static int read_sem_state(int sem, __u32 *count, __u32 *max)
+{
+	struct ntsync_sem_args args;
+	int ret;
+
+	memset(&args, 0xcc, sizeof(args));
+	ret = ioctl(sem, NTSYNC_IOC_SEM_READ, &args);
+	*count = args.count;
+	*max = args.max;
+	return ret;
+}
+
+#define check_sem_state(sem, count, max) \
+	({ \
+		__u32 __count, __max; \
+		int ret = read_sem_state((sem), &__count, &__max); \
+		EXPECT_EQ(0, ret); \
+		EXPECT_EQ((count), __count); \
+		EXPECT_EQ((max), __max); \
+	})
+
+static int post_sem(int sem, __u32 *count)
+{
+	return ioctl(sem, NTSYNC_IOC_SEM_POST, count);
+}
+
+static int read_mutex_state(int mutex, __u32 *count, __u32 *owner)
+{
+	struct ntsync_mutex_args args;
+	int ret;
+
+	memset(&args, 0xcc, sizeof(args));
+	ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &args);
+	*count = args.count;
+	*owner = args.owner;
+	return ret;
+}
+
+#define check_mutex_state(mutex, count, owner) \
+	({ \
+		__u32 __count, __owner; \
+		int ret = read_mutex_state((mutex), &__count, &__owner); \
+		EXPECT_EQ(0, ret); \
+		EXPECT_EQ((count), __count); \
+		EXPECT_EQ((owner), __owner); \
+	})
+
+static int unlock_mutex(int mutex, __u32 owner, __u32 *count)
+{
+	struct ntsync_mutex_args args;
+	int ret;
+
+	args.owner = owner;
+	args.count = 0xdeadbeef;
+	ret = ioctl(mutex, NTSYNC_IOC_MUTEX_UNLOCK, &args);
+	*count = args.count;
+	return ret;
+}
+
+static int read_event_state(int event, __u32 *signaled, __u32 *manual)
+{
+	struct ntsync_event_args args;
+	int ret;
+
+	memset(&args, 0xcc, sizeof(args));
+	ret = ioctl(event, NTSYNC_IOC_EVENT_READ, &args);
+	*signaled = args.signaled;
+	*manual = args.manual;
+	return ret;
+}
+
+#define check_event_state(event, signaled, manual) \
+	({ \
+		__u32 __signaled, __manual; \
+		int ret = read_event_state((event), &__signaled, &__manual); \
+		EXPECT_EQ(0, ret); \
+		EXPECT_EQ((signaled), __signaled); \
+		EXPECT_EQ((manual), __manual); \
+	})
+
+static int wait_objs(int fd, unsigned long request, __u32 count,
+		     const int *objs, __u32 owner, int alert, __u32 *index)
+{
+	struct ntsync_wait_args args = {0};
+	struct timespec timeout;
+	int ret;
+
+	clock_gettime(CLOCK_MONOTONIC, &timeout);
+
+	args.timeout = timeout.tv_sec * 1000000000 + timeout.tv_nsec;
+	args.count = count;
+	args.objs = (uintptr_t)objs;
+	args.owner = owner;
+	args.index = 0xdeadbeef;
+	args.alert = alert;
+	ret = ioctl(fd, request, &args);
+	*index = args.index;
+	return ret;
+}
+
+static int wait_any(int fd, __u32 count, const int *objs, __u32 owner, __u32 *index)
+{
+	return wait_objs(fd, NTSYNC_IOC_WAIT_ANY, count, objs, owner, 0, index);
+}
+
+static int wait_all(int fd, __u32 count, const int *objs, __u32 owner, __u32 *index)
+{
+	return wait_objs(fd, NTSYNC_IOC_WAIT_ALL, count, objs, owner, 0, index);
+}
+
+static int wait_any_alert(int fd, __u32 count, const int *objs,
+			  __u32 owner, int alert, __u32 *index)
+{
+	return wait_objs(fd, NTSYNC_IOC_WAIT_ANY,
+			 count, objs, owner, alert, index);
+}
+
+static int wait_all_alert(int fd, __u32 count, const int *objs,
+			  __u32 owner, int alert, __u32 *index)
+{
+	return wait_objs(fd, NTSYNC_IOC_WAIT_ALL,
+			 count, objs, owner, alert, index);
+}
+
+TEST(semaphore_state)
+{
+	struct ntsync_sem_args sem_args;
+	struct timespec timeout;
+	__u32 count, index;
+	int fd, ret, sem;
+
+	clock_gettime(CLOCK_MONOTONIC, &timeout);
+
+	fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY);
+	ASSERT_LE(0, fd);
+
+	sem_args.count = 3;
+	sem_args.max = 2;
+	sem_args.sem = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EINVAL, errno);
+
+	sem_args.count = 2;
+	sem_args.max = 2;
+	sem_args.sem = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, sem_args.sem);
+	sem = sem_args.sem;
+	check_sem_state(sem, 2, 2);
+
+	count = 0;
+	ret = post_sem(sem, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(2, count);
+	check_sem_state(sem, 2, 2);
+
+	count = 1;
+	ret = post_sem(sem, &count);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EOVERFLOW, errno);
+	check_sem_state(sem, 2, 2);
+
+	ret = wait_any(fd, 1, &sem, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_sem_state(sem, 1, 2);
+
+	ret = wait_any(fd, 1, &sem, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_sem_state(sem, 0, 2);
+
+	ret = wait_any(fd, 1, &sem, 123, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(ETIMEDOUT, errno);
+
+	count = 3;
+	ret = post_sem(sem, &count);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EOVERFLOW, errno);
+	check_sem_state(sem, 0, 2);
+
+	count = 2;
+	ret = post_sem(sem, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, count);
+	check_sem_state(sem, 2, 2);
+
+	ret = wait_any(fd, 1, &sem, 123, &index);
+	EXPECT_EQ(0, ret);
+	ret = wait_any(fd, 1, &sem, 123, &index);
+	EXPECT_EQ(0, ret);
+
+	count = 1;
+	ret = post_sem(sem, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, count);
+	check_sem_state(sem, 1, 2);
+
+	count = ~0u;
+	ret = post_sem(sem, &count);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EOVERFLOW, errno);
+	check_sem_state(sem, 1, 2);
+
+	close(sem);
+
+	close(fd);
+}
+
+TEST(mutex_state)
+{
+	struct ntsync_mutex_args mutex_args;
+	__u32 owner, count, index;
+	struct timespec timeout;
+	int fd, ret, mutex;
+
+	clock_gettime(CLOCK_MONOTONIC, &timeout);
+
+	fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY);
+	ASSERT_LE(0, fd);
+
+	mutex_args.owner = 123;
+	mutex_args.count = 0;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EINVAL, errno);
+
+	mutex_args.owner = 0;
+	mutex_args.count = 2;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EINVAL, errno);
+
+	mutex_args.owner = 123;
+	mutex_args.count = 2;
+	mutex_args.mutex = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, mutex_args.mutex);
+	mutex = mutex_args.mutex;
+	check_mutex_state(mutex, 2, 123);
+
+	ret = unlock_mutex(mutex, 0, &count);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EINVAL, errno);
+
+	ret = unlock_mutex(mutex, 456, &count);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EPERM, errno);
+	check_mutex_state(mutex, 2, 123);
+
+	ret = unlock_mutex(mutex, 123, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(2, count);
+	check_mutex_state(mutex, 1, 123);
+
+	ret = unlock_mutex(mutex, 123, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, count);
+	check_mutex_state(mutex, 0, 0);
+
+	ret = unlock_mutex(mutex, 123, &count);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EPERM, errno);
+
+	ret = wait_any(fd, 1, &mutex, 456, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_mutex_state(mutex, 1, 456);
+
+	ret = wait_any(fd, 1, &mutex, 456, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_mutex_state(mutex, 2, 456);
+
+	ret = unlock_mutex(mutex, 456, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(2, count);
+	check_mutex_state(mutex, 1, 456);
+
+	ret = wait_any(fd, 1, &mutex, 123, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(ETIMEDOUT, errno);
+
+	owner = 0;
+	ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EINVAL, errno);
+
+	owner = 123;
+	ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EPERM, errno);
+	check_mutex_state(mutex, 1, 456);
+
+	owner = 456;
+	ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner);
+	EXPECT_EQ(0, ret);
+
+	memset(&mutex_args, 0xcc, sizeof(mutex_args));
+	ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &mutex_args);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EOWNERDEAD, errno);
+	EXPECT_EQ(0, mutex_args.count);
+	EXPECT_EQ(0, mutex_args.owner);
+
+	memset(&mutex_args, 0xcc, sizeof(mutex_args));
+	ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &mutex_args);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EOWNERDEAD, errno);
+	EXPECT_EQ(0, mutex_args.count);
+	EXPECT_EQ(0, mutex_args.owner);
+
+	ret = wait_any(fd, 1, &mutex, 123, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EOWNERDEAD, errno);
+	EXPECT_EQ(0, index);
+	check_mutex_state(mutex, 1, 123);
+
+	owner = 123;
+	ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner);
+	EXPECT_EQ(0, ret);
+
+	memset(&mutex_args, 0xcc, sizeof(mutex_args));
+	ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &mutex_args);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EOWNERDEAD, errno);
+	EXPECT_EQ(0, mutex_args.count);
+	EXPECT_EQ(0, mutex_args.owner);
+
+	ret = wait_any(fd, 1, &mutex, 123, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EOWNERDEAD, errno);
+	EXPECT_EQ(0, index);
+	check_mutex_state(mutex, 1, 123);
+
+	close(mutex);
+
+	mutex_args.owner = 0;
+	mutex_args.count = 0;
+	mutex_args.mutex = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, mutex_args.mutex);
+	mutex = mutex_args.mutex;
+	check_mutex_state(mutex, 0, 0);
+
+	ret = wait_any(fd, 1, &mutex, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_mutex_state(mutex, 1, 123);
+
+	close(mutex);
+
+	mutex_args.owner = 123;
+	mutex_args.count = ~0u;
+	mutex_args.mutex = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, mutex_args.mutex);
+	mutex = mutex_args.mutex;
+	check_mutex_state(mutex, ~0u, 123);
+
+	ret = wait_any(fd, 1, &mutex, 123, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(ETIMEDOUT, errno);
+
+	close(mutex);
+
+	close(fd);
+}
+
+TEST(manual_event_state)
+{
+	struct ntsync_event_args event_args;
+	__u32 index, signaled;
+	int fd, event, ret;
+
+	fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY);
+	ASSERT_LE(0, fd);
+
+	event_args.manual = 1;
+	event_args.signaled = 0;
+	event_args.event = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, event_args.event);
+	event = event_args.event;
+	check_event_state(event, 0, 1);
+
+	signaled = 0xdeadbeef;
+	ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, signaled);
+	check_event_state(event, 1, 1);
+
+	ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, signaled);
+	check_event_state(event, 1, 1);
+
+	ret = wait_any(fd, 1, &event, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_event_state(event, 1, 1);
+
+	signaled = 0xdeadbeef;
+	ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, signaled);
+	check_event_state(event, 0, 1);
+
+	ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, signaled);
+	check_event_state(event, 0, 1);
+
+	ret = wait_any(fd, 1, &event, 123, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(ETIMEDOUT, errno);
+
+	ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, signaled);
+
+	ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, signaled);
+	check_event_state(event, 0, 1);
+
+	ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, signaled);
+	check_event_state(event, 0, 1);
+
+	close(event);
+
+	close(fd);
+}
+
+TEST(auto_event_state)
+{
+	struct ntsync_event_args event_args;
+	__u32 index, signaled;
+	int fd, event, ret;
+
+	fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY);
+	ASSERT_LE(0, fd);
+
+	event_args.manual = 0;
+	event_args.signaled = 1;
+	event_args.event = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, event_args.event);
+	event = event_args.event;
+
+	check_event_state(event, 1, 0);
+
+	signaled = 0xdeadbeef;
+	ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, signaled);
+	check_event_state(event, 1, 0);
+
+	ret = wait_any(fd, 1, &event, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_event_state(event, 0, 0);
+
+	signaled = 0xdeadbeef;
+	ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, signaled);
+	check_event_state(event, 0, 0);
+
+	ret = wait_any(fd, 1, &event, 123, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(ETIMEDOUT, errno);
+
+	ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, signaled);
+
+	ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, signaled);
+	check_event_state(event, 0, 0);
+
+	ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, signaled);
+	check_event_state(event, 0, 0);
+
+	close(event);
+
+	close(fd);
+}
+
+TEST(test_wait_any)
+{
+	int objs[NTSYNC_MAX_WAIT_COUNT + 1], fd, ret;
+	struct ntsync_mutex_args mutex_args = {0};
+	struct ntsync_sem_args sem_args = {0};
+	__u32 owner, index, count, i;
+	struct timespec timeout;
+
+	clock_gettime(CLOCK_MONOTONIC, &timeout);
+
+	fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY);
+	ASSERT_LE(0, fd);
+
+	sem_args.count = 2;
+	sem_args.max = 3;
+	sem_args.sem = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, sem_args.sem);
+
+	mutex_args.owner = 0;
+	mutex_args.count = 0;
+	mutex_args.mutex = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, mutex_args.mutex);
+
+	objs[0] = sem_args.sem;
+	objs[1] = mutex_args.mutex;
+
+	ret = wait_any(fd, 2, objs, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_sem_state(sem_args.sem, 1, 3);
+	check_mutex_state(mutex_args.mutex, 0, 0);
+
+	ret = wait_any(fd, 2, objs, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_sem_state(sem_args.sem, 0, 3);
+	check_mutex_state(mutex_args.mutex, 0, 0);
+
+	ret = wait_any(fd, 2, objs, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, index);
+	check_sem_state(sem_args.sem, 0, 3);
+	check_mutex_state(mutex_args.mutex, 1, 123);
+
+	count = 1;
+	ret = post_sem(sem_args.sem, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, count);
+
+	ret = wait_any(fd, 2, objs, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_sem_state(sem_args.sem, 0, 3);
+	check_mutex_state(mutex_args.mutex, 1, 123);
+
+	ret = wait_any(fd, 2, objs, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, index);
+	check_sem_state(sem_args.sem, 0, 3);
+	check_mutex_state(mutex_args.mutex, 2, 123);
+
+	ret = wait_any(fd, 2, objs, 456, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(ETIMEDOUT, errno);
+
+	owner = 123;
+	ret = ioctl(mutex_args.mutex, NTSYNC_IOC_MUTEX_KILL, &owner);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_any(fd, 2, objs, 456, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EOWNERDEAD, errno);
+	EXPECT_EQ(1, index);
+
+	ret = wait_any(fd, 2, objs, 456, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, index);
+
+	/* test waiting on the same object twice */
+	count = 2;
+	ret = post_sem(sem_args.sem, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, count);
+
+	objs[0] = objs[1] = sem_args.sem;
+	ret = wait_any(fd, 2, objs, 456, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_sem_state(sem_args.sem, 1, 3);
+
+	ret = wait_any(fd, 0, NULL, 456, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(ETIMEDOUT, errno);
+
+	for (i = 0; i < NTSYNC_MAX_WAIT_COUNT + 1; ++i)
+		objs[i] = sem_args.sem;
+
+	ret = wait_any(fd, NTSYNC_MAX_WAIT_COUNT, objs, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+
+	ret = wait_any(fd, NTSYNC_MAX_WAIT_COUNT + 1, objs, 123, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EINVAL, errno);
+
+	ret = wait_any(fd, -1, objs, 123, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EINVAL, errno);
+
+	close(sem_args.sem);
+	close(mutex_args.mutex);
+
+	close(fd);
+}
+
+TEST(test_wait_all)
+{
+	struct ntsync_event_args event_args = {0};
+	struct ntsync_mutex_args mutex_args = {0};
+	struct ntsync_sem_args sem_args = {0};
+	__u32 owner, index, count;
+	int objs[2], fd, ret;
+
+	fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY);
+	ASSERT_LE(0, fd);
+
+	sem_args.count = 2;
+	sem_args.max = 3;
+	sem_args.sem = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, sem_args.sem);
+
+	mutex_args.owner = 0;
+	mutex_args.count = 0;
+	mutex_args.mutex = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, mutex_args.mutex);
+
+	event_args.manual = true;
+	event_args.signaled = true;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args);
+	EXPECT_EQ(0, ret);
+
+	objs[0] = sem_args.sem;
+	objs[1] = mutex_args.mutex;
+
+	ret = wait_all(fd, 2, objs, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_sem_state(sem_args.sem, 1, 3);
+	check_mutex_state(mutex_args.mutex, 1, 123);
+
+	ret = wait_all(fd, 2, objs, 456, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(ETIMEDOUT, errno);
+	check_sem_state(sem_args.sem, 1, 3);
+	check_mutex_state(mutex_args.mutex, 1, 123);
+
+	ret = wait_all(fd, 2, objs, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_sem_state(sem_args.sem, 0, 3);
+	check_mutex_state(mutex_args.mutex, 2, 123);
+
+	ret = wait_all(fd, 2, objs, 123, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(ETIMEDOUT, errno);
+	check_sem_state(sem_args.sem, 0, 3);
+	check_mutex_state(mutex_args.mutex, 2, 123);
+
+	count = 3;
+	ret = post_sem(sem_args.sem, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, count);
+
+	ret = wait_all(fd, 2, objs, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_sem_state(sem_args.sem, 2, 3);
+	check_mutex_state(mutex_args.mutex, 3, 123);
+
+	owner = 123;
+	ret = ioctl(mutex_args.mutex, NTSYNC_IOC_MUTEX_KILL, &owner);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_all(fd, 2, objs, 123, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EOWNERDEAD, errno);
+	check_sem_state(sem_args.sem, 1, 3);
+	check_mutex_state(mutex_args.mutex, 1, 123);
+
+	objs[0] = sem_args.sem;
+	objs[1] = event_args.event;
+	ret = wait_all(fd, 2, objs, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+	check_sem_state(sem_args.sem, 0, 3);
+	check_event_state(event_args.event, 1, 1);
+
+	/* test waiting on the same object twice */
+	objs[0] = objs[1] = sem_args.sem;
+	ret = wait_all(fd, 2, objs, 123, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(EINVAL, errno);
+
+	close(sem_args.sem);
+	close(mutex_args.mutex);
+	close(event_args.event);
+
+	close(fd);
+}
+
+struct wake_args {
+	int fd;
+	int obj;
+};
+
+struct wait_args {
+	int fd;
+	unsigned long request;
+	struct ntsync_wait_args *args;
+	int ret;
+	int err;
+};
+
+static void *wait_thread(void *arg)
+{
+	struct wait_args *args = arg;
+
+	args->ret = ioctl(args->fd, args->request, args->args);
+	args->err = errno;
+	return NULL;
+}
+
+static __u64 get_abs_timeout(unsigned int ms)
+{
+	struct timespec timeout;
+	clock_gettime(CLOCK_MONOTONIC, &timeout);
+	return (timeout.tv_sec * 1000000000) + timeout.tv_nsec + (ms * 1000000);
+}
+
+static int wait_for_thread(pthread_t thread, unsigned int ms)
+{
+	struct timespec timeout;
+
+	clock_gettime(CLOCK_REALTIME, &timeout);
+	timeout.tv_nsec += ms * 1000000;
+	timeout.tv_sec += (timeout.tv_nsec / 1000000000);
+	timeout.tv_nsec %= 1000000000;
+	return pthread_timedjoin_np(thread, NULL, &timeout);
+}
+
+TEST(wake_any)
+{
+	struct ntsync_event_args event_args = {0};
+	struct ntsync_mutex_args mutex_args = {0};
+	struct ntsync_wait_args wait_args = {0};
+	struct ntsync_sem_args sem_args = {0};
+	struct wait_args thread_args;
+	__u32 count, index, signaled;
+	int objs[2], fd, ret;
+	pthread_t thread;
+
+	fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY);
+	ASSERT_LE(0, fd);
+
+	sem_args.count = 0;
+	sem_args.max = 3;
+	sem_args.sem = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, sem_args.sem);
+
+	mutex_args.owner = 123;
+	mutex_args.count = 1;
+	mutex_args.mutex = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, mutex_args.mutex);
+
+	objs[0] = sem_args.sem;
+	objs[1] = mutex_args.mutex;
+
+	/* test waking the semaphore */
+
+	wait_args.timeout = get_abs_timeout(1000);
+	wait_args.objs = (uintptr_t)objs;
+	wait_args.count = 2;
+	wait_args.owner = 456;
+	wait_args.index = 0xdeadbeef;
+	thread_args.fd = fd;
+	thread_args.args = &wait_args;
+	thread_args.request = NTSYNC_IOC_WAIT_ANY;
+	ret = pthread_create(&thread, NULL, wait_thread, &thread_args);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(ETIMEDOUT, ret);
+
+	count = 1;
+	ret = post_sem(sem_args.sem, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, count);
+	check_sem_state(sem_args.sem, 0, 3);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, thread_args.ret);
+	EXPECT_EQ(0, wait_args.index);
+
+	/* test waking the mutex */
+
+	/* first grab it again for owner 123 */
+	ret = wait_any(fd, 1, &mutex_args.mutex, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+
+	wait_args.timeout = get_abs_timeout(1000);
+	wait_args.owner = 456;
+	ret = pthread_create(&thread, NULL, wait_thread, &thread_args);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(ETIMEDOUT, ret);
+
+	ret = unlock_mutex(mutex_args.mutex, 123, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(2, count);
+
+	ret = pthread_tryjoin_np(thread, NULL);
+	EXPECT_EQ(EBUSY, ret);
+
+	ret = unlock_mutex(mutex_args.mutex, 123, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, mutex_args.count);
+	check_mutex_state(mutex_args.mutex, 1, 456);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, thread_args.ret);
+	EXPECT_EQ(1, wait_args.index);
+
+	/* test waking events */
+
+	event_args.manual = false;
+	event_args.signaled = false;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args);
+	EXPECT_EQ(0, ret);
+
+	objs[1] = event_args.event;
+	wait_args.timeout = get_abs_timeout(1000);
+	ret = pthread_create(&thread, NULL, wait_thread, &thread_args);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(ETIMEDOUT, ret);
+
+	ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, signaled);
+	check_event_state(event_args.event, 0, 0);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, thread_args.ret);
+	EXPECT_EQ(1, wait_args.index);
+
+	wait_args.timeout = get_abs_timeout(1000);
+	ret = pthread_create(&thread, NULL, wait_thread, &thread_args);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(ETIMEDOUT, ret);
+
+	ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_PULSE, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, signaled);
+	check_event_state(event_args.event, 0, 0);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, thread_args.ret);
+	EXPECT_EQ(1, wait_args.index);
+
+	close(event_args.event);
+
+	event_args.manual = true;
+	event_args.signaled = false;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args);
+	EXPECT_EQ(0, ret);
+
+	objs[1] = event_args.event;
+	wait_args.timeout = get_abs_timeout(1000);
+	ret = pthread_create(&thread, NULL, wait_thread, &thread_args);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(ETIMEDOUT, ret);
+
+	ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, signaled);
+	check_event_state(event_args.event, 1, 1);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, thread_args.ret);
+	EXPECT_EQ(1, wait_args.index);
+
+	ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, signaled);
+
+	wait_args.timeout = get_abs_timeout(1000);
+	ret = pthread_create(&thread, NULL, wait_thread, &thread_args);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(ETIMEDOUT, ret);
+
+	ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_PULSE, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, signaled);
+	check_event_state(event_args.event, 0, 1);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, thread_args.ret);
+	EXPECT_EQ(1, wait_args.index);
+
+	close(event_args.event);
+
+	/* delete an object while it's being waited on */
+
+	wait_args.timeout = get_abs_timeout(200);
+	wait_args.owner = 123;
+	objs[1] = mutex_args.mutex;
+	ret = pthread_create(&thread, NULL, wait_thread, &thread_args);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(ETIMEDOUT, ret);
+
+	close(sem_args.sem);
+	close(mutex_args.mutex);
+
+	ret = wait_for_thread(thread, 200);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(-1, thread_args.ret);
+	EXPECT_EQ(ETIMEDOUT, thread_args.err);
+
+	close(fd);
+}
+
+TEST(wake_all)
+{
+	struct ntsync_event_args manual_event_args = {0};
+	struct ntsync_event_args auto_event_args = {0};
+	struct ntsync_mutex_args mutex_args = {0};
+	struct ntsync_wait_args wait_args = {0};
+	struct ntsync_sem_args sem_args = {0};
+	struct wait_args thread_args;
+	__u32 count, index, signaled;
+	int objs[4], fd, ret;
+	pthread_t thread;
+
+	fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY);
+	ASSERT_LE(0, fd);
+
+	sem_args.count = 0;
+	sem_args.max = 3;
+	sem_args.sem = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, sem_args.sem);
+
+	mutex_args.owner = 123;
+	mutex_args.count = 1;
+	mutex_args.mutex = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, mutex_args.mutex);
+
+	manual_event_args.manual = true;
+	manual_event_args.signaled = true;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &manual_event_args);
+	EXPECT_EQ(0, ret);
+
+	auto_event_args.manual = false;
+	auto_event_args.signaled = true;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &auto_event_args);
+	EXPECT_EQ(0, ret);
+
+	objs[0] = sem_args.sem;
+	objs[1] = mutex_args.mutex;
+	objs[2] = manual_event_args.event;
+	objs[3] = auto_event_args.event;
+
+	wait_args.timeout = get_abs_timeout(1000);
+	wait_args.objs = (uintptr_t)objs;
+	wait_args.count = 4;
+	wait_args.owner = 456;
+	thread_args.fd = fd;
+	thread_args.args = &wait_args;
+	thread_args.request = NTSYNC_IOC_WAIT_ALL;
+	ret = pthread_create(&thread, NULL, wait_thread, &thread_args);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(ETIMEDOUT, ret);
+
+	count = 1;
+	ret = post_sem(sem_args.sem, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, count);
+
+	ret = pthread_tryjoin_np(thread, NULL);
+	EXPECT_EQ(EBUSY, ret);
+
+	check_sem_state(sem_args.sem, 1, 3);
+
+	ret = wait_any(fd, 1, &sem_args.sem, 123, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+
+	ret = unlock_mutex(mutex_args.mutex, 123, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, count);
+
+	ret = pthread_tryjoin_np(thread, NULL);
+	EXPECT_EQ(EBUSY, ret);
+
+	check_mutex_state(mutex_args.mutex, 0, 0);
+
+	ret = ioctl(manual_event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, signaled);
+
+	count = 2;
+	ret = post_sem(sem_args.sem, &count);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, count);
+	check_sem_state(sem_args.sem, 2, 3);
+
+	ret = ioctl(auto_event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, signaled);
+
+	ret = ioctl(manual_event_args.event, NTSYNC_IOC_EVENT_SET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, signaled);
+
+	ret = ioctl(auto_event_args.event, NTSYNC_IOC_EVENT_SET, &signaled);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, signaled);
+
+	check_sem_state(sem_args.sem, 1, 3);
+	check_mutex_state(mutex_args.mutex, 1, 456);
+	check_event_state(manual_event_args.event, 1, 1);
+	check_event_state(auto_event_args.event, 0, 0);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, thread_args.ret);
+
+	/* delete an object while it's being waited on */
+
+	wait_args.timeout = get_abs_timeout(200);
+	wait_args.owner = 123;
+	ret = pthread_create(&thread, NULL, wait_thread, &thread_args);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(ETIMEDOUT, ret);
+
+	close(sem_args.sem);
+	close(mutex_args.mutex);
+	close(manual_event_args.event);
+	close(auto_event_args.event);
+
+	ret = wait_for_thread(thread, 200);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(-1, thread_args.ret);
+	EXPECT_EQ(ETIMEDOUT, thread_args.err);
+
+	close(fd);
+}
+
+TEST(alert_any)
+{
+	struct ntsync_event_args event_args = {0};
+	struct ntsync_wait_args wait_args = {0};
+	struct ntsync_sem_args sem_args = {0};
+	__u32 index, count, signaled;
+	struct wait_args thread_args;
+	int objs[2], fd, ret;
+	pthread_t thread;
+
+	fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY);
+	ASSERT_LE(0, fd);
+
+	sem_args.count = 0;
+	sem_args.max = 2;
+	sem_args.sem = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, sem_args.sem);
+	objs[0] = sem_args.sem;
+
+	sem_args.count = 1;
+	sem_args.max = 2;
+	sem_args.sem = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, sem_args.sem);
+	objs[1] = sem_args.sem;
+
+	event_args.manual = true;
+	event_args.signaled = true;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_any_alert(fd, 0, NULL, 123, event_args.event, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+
+	ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_any_alert(fd, 0, NULL, 123, event_args.event, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(ETIMEDOUT, errno);
+
+	ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(1, index);
+
+	ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(2, index);
+
+	/* test wakeup via alert */
+
+	ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled);
+	EXPECT_EQ(0, ret);
+
+	wait_args.timeout = get_abs_timeout(1000);
+	wait_args.objs = (uintptr_t)objs;
+	wait_args.count = 2;
+	wait_args.owner = 123;
+	wait_args.index = 0xdeadbeef;
+	wait_args.alert = event_args.event;
+	thread_args.fd = fd;
+	thread_args.args = &wait_args;
+	thread_args.request = NTSYNC_IOC_WAIT_ANY;
+	ret = pthread_create(&thread, NULL, wait_thread, &thread_args);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(ETIMEDOUT, ret);
+
+	ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, thread_args.ret);
+	EXPECT_EQ(2, wait_args.index);
+
+	close(event_args.event);
+
+	/* test with an auto-reset event */
+
+	event_args.manual = false;
+	event_args.signaled = true;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args);
+	EXPECT_EQ(0, ret);
+
+	count = 1;
+	ret = post_sem(objs[0], &count);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+
+	ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(2, index);
+
+	ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(ETIMEDOUT, errno);
+
+	close(event_args.event);
+
+	close(objs[0]);
+	close(objs[1]);
+
+	close(fd);
+}
+
+TEST(alert_all)
+{
+	struct ntsync_event_args event_args = {0};
+	struct ntsync_wait_args wait_args = {0};
+	struct ntsync_sem_args sem_args = {0};
+	struct wait_args thread_args;
+	__u32 index, count, signaled;
+	int objs[2], fd, ret;
+	pthread_t thread;
+
+	fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY);
+	ASSERT_LE(0, fd);
+
+	sem_args.count = 2;
+	sem_args.max = 2;
+	sem_args.sem = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, sem_args.sem);
+	objs[0] = sem_args.sem;
+
+	sem_args.count = 1;
+	sem_args.max = 2;
+	sem_args.sem = 0xdeadbeef;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args);
+	EXPECT_EQ(0, ret);
+	EXPECT_NE(0xdeadbeef, sem_args.sem);
+	objs[1] = sem_args.sem;
+
+	event_args.manual = true;
+	event_args.signaled = true;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+
+	ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(2, index);
+
+	/* test wakeup via alert */
+
+	ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled);
+	EXPECT_EQ(0, ret);
+
+	wait_args.timeout = get_abs_timeout(1000);
+	wait_args.objs = (uintptr_t)objs;
+	wait_args.count = 2;
+	wait_args.owner = 123;
+	wait_args.index = 0xdeadbeef;
+	wait_args.alert = event_args.event;
+	thread_args.fd = fd;
+	thread_args.args = &wait_args;
+	thread_args.request = NTSYNC_IOC_WAIT_ALL;
+	ret = pthread_create(&thread, NULL, wait_thread, &thread_args);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(ETIMEDOUT, ret);
+
+	ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_for_thread(thread, 100);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, thread_args.ret);
+	EXPECT_EQ(2, wait_args.index);
+
+	close(event_args.event);
+
+	/* test with an auto-reset event */
+
+	event_args.manual = false;
+	event_args.signaled = true;
+	ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args);
+	EXPECT_EQ(0, ret);
+
+	count = 2;
+	ret = post_sem(objs[1], &count);
+	EXPECT_EQ(0, ret);
+
+	ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(0, index);
+
+	ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index);
+	EXPECT_EQ(0, ret);
+	EXPECT_EQ(2, index);
+
+	ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index);
+	EXPECT_EQ(-1, ret);
+	EXPECT_EQ(ETIMEDOUT, errno);
+
+	close(event_args.event);
+
+	close(objs[0]);
+	close(objs[1]);
+
+	close(fd);
+}
+
+#define STRESS_LOOPS 10000
+#define STRESS_THREADS 4
+
+static unsigned int stress_counter;
+static int stress_device, stress_start_event, stress_mutex;
+
+static void *stress_thread(void *arg)
+{
+	struct ntsync_wait_args wait_args = {0};
+	__u32 index, count, i;
+	int ret;
+
+	wait_args.timeout = UINT64_MAX;
+	wait_args.count = 1;
+	wait_args.objs = (uintptr_t)&stress_start_event;
+	wait_args.owner = gettid();
+	wait_args.index = 0xdeadbeef;
+
+	ioctl(stress_device, NTSYNC_IOC_WAIT_ANY, &wait_args);
+
+	wait_args.objs = (uintptr_t)&stress_mutex;
+
+	for (i = 0; i < STRESS_LOOPS; ++i) {
+		ioctl(stress_device, NTSYNC_IOC_WAIT_ANY, &wait_args);
+
+		++stress_counter;
+
+		unlock_mutex(stress_mutex, wait_args.owner, &count);
+	}
+
+	return NULL;
+}
+
+TEST(stress_wait)
+{
+	struct ntsync_event_args event_args;
+	struct ntsync_mutex_args mutex_args;
+	pthread_t threads[STRESS_THREADS];
+	__u32 signaled, i;
+	int ret;
+
+	stress_device = open("/dev/ntsync", O_CLOEXEC | O_RDONLY);
+	ASSERT_LE(0, stress_device);
+
+	mutex_args.owner = 0;
+	mutex_args.count = 0;
+	ret = ioctl(stress_device, NTSYNC_IOC_CREATE_MUTEX, &mutex_args);
+	EXPECT_EQ(0, ret);
+	stress_mutex = mutex_args.mutex;
+
+	event_args.manual = 1;
+	event_args.signaled = 0;
+	ret = ioctl(stress_device, NTSYNC_IOC_CREATE_EVENT, &event_args);
+	EXPECT_EQ(0, ret);
+	stress_start_event = event_args.event;
+
+	for (i = 0; i < STRESS_THREADS; ++i)
+		pthread_create(&threads[i], NULL, stress_thread, NULL);
+
+	ret = ioctl(stress_start_event, NTSYNC_IOC_EVENT_SET, &signaled);
+	EXPECT_EQ(0, ret);
+
+	for (i = 0; i < STRESS_THREADS; ++i) {
+		ret = pthread_join(threads[i], NULL);
+		EXPECT_EQ(0, ret);
+	}
+
+	EXPECT_EQ(STRESS_LOOPS * STRESS_THREADS, stress_counter);
+
+	close(stress_start_event);
+	close(stress_mutex);
+	close(stress_device);
+}
+
+TEST_HARNESS_MAIN
-- 
2.46.0

From 75719451edff6bd0e09ec5d9465f025e78c1ae26 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Sun, 8 Sep 2024 10:21:43 +0200
Subject: [PATCH 10/12] perf-per-core

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 Documentation/arch/x86/topology.rst   |   4 +
 arch/x86/events/rapl.c                | 418 ++++++++++++++++++--------
 arch/x86/include/asm/processor.h      |   1 +
 arch/x86/include/asm/topology.h       |   1 +
 arch/x86/kernel/cpu/debugfs.c         |   1 +
 arch/x86/kernel/cpu/topology_common.c |   1 +
 6 files changed, 305 insertions(+), 121 deletions(-)

diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst
index 7352ab89a55a..c12837e61bda 100644
--- a/Documentation/arch/x86/topology.rst
+++ b/Documentation/arch/x86/topology.rst
@@ -135,6 +135,10 @@ Thread-related topology information in the kernel:
     The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo
     "core_id."
 
+  - topology_logical_core_id();
+
+    The logical core ID to which a thread belongs.
+
 
 
 System topology examples
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 0c5e7a7c43ac..cd808b699ccc 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -39,6 +39,10 @@
  *	  event: rapl_energy_psys
  *    perf code: 0x5
  *
+ *  per_core counter: consumption of a single physical core
+ *	  event: rapl_energy_per_core (power_per_core PMU)
+ *    perf code: 0x1
+ *
  * We manage those counters as free running (read-only). They may be
  * use simultaneously by other tools, such as turbostat.
  *
@@ -70,18 +74,25 @@ MODULE_LICENSE("GPL");
 /*
  * RAPL energy status counters
  */
-enum perf_rapl_events {
+enum perf_rapl_pkg_events {
 	PERF_RAPL_PP0 = 0,		/* all cores */
 	PERF_RAPL_PKG,			/* entire package */
 	PERF_RAPL_RAM,			/* DRAM */
 	PERF_RAPL_PP1,			/* gpu */
 	PERF_RAPL_PSYS,			/* psys */
 
-	PERF_RAPL_MAX,
-	NR_RAPL_DOMAINS = PERF_RAPL_MAX,
+	PERF_RAPL_PKG_EVENTS_MAX,
+	NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX,
+};
+
+enum perf_rapl_core_events {
+	PERF_RAPL_PER_CORE = 0,		/* per-core */
+
+	PERF_RAPL_CORE_EVENTS_MAX,
+	NR_RAPL_CORE_DOMAINS = PERF_RAPL_CORE_EVENTS_MAX,
 };
 
-static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = {
 	"pp0-core",
 	"package",
 	"dram",
@@ -89,6 +100,10 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
 	"psys",
 };
 
+static const char *const rapl_core_domain_names[NR_RAPL_CORE_DOMAINS] __initconst = {
+	"per-core",
+};
+
 /*
  * event code: LSB 8 bits, passed in attr->config
  * any other bit is reserved
@@ -103,6 +118,10 @@ static struct perf_pmu_events_attr event_attr_##v = {				\
 	.event_str	= str,							\
 };
 
+#define rapl_pmu_is_pkg_scope()				\
+	(boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||	\
+	 boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
+
 struct rapl_pmu {
 	raw_spinlock_t		lock;
 	int			n_active;
@@ -115,8 +134,9 @@ struct rapl_pmu {
 
 struct rapl_pmus {
 	struct pmu		pmu;
+	cpumask_t		cpumask;
 	unsigned int		nr_rapl_pmu;
-	struct rapl_pmu		*pmus[] __counted_by(nr_rapl_pmu);
+	struct rapl_pmu		*rapl_pmu[] __counted_by(nr_rapl_pmu);
 };
 
 enum rapl_unit_quirk {
@@ -126,29 +146,45 @@ enum rapl_unit_quirk {
 };
 
 struct rapl_model {
-	struct perf_msr *rapl_msrs;
-	unsigned long	events;
+	struct perf_msr *rapl_pkg_msrs;
+	struct perf_msr *rapl_core_msrs;
+	unsigned long	pkg_events;
+	unsigned long	core_events;
 	unsigned int	msr_power_unit;
 	enum rapl_unit_quirk	unit_quirk;
 };
 
  /* 1/2^hw_unit Joule */
-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
-static struct rapl_pmus *rapl_pmus;
-static cpumask_t rapl_cpu_mask;
-static unsigned int rapl_cntr_mask;
+static int rapl_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly;
+static struct rapl_pmus *rapl_pmus_pkg;
+static struct rapl_pmus *rapl_pmus_core;
+static unsigned int rapl_pkg_cntr_mask;
+static unsigned int rapl_core_cntr_mask;
 static u64 rapl_timer_ms;
-static struct perf_msr *rapl_msrs;
+static struct rapl_model *rapl_model;
+
+static inline unsigned int get_rapl_pmu_idx(int cpu)
+{
+	return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) :
+					 topology_logical_die_id(cpu);
+}
+
+static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu)
+{
+	return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) :
+					 topology_die_cpumask(cpu);
+}
 
 static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
 {
-	unsigned int rapl_pmu_idx = topology_logical_die_id(cpu);
+	unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu);
 
 	/*
 	 * The unsigned check also catches the '-1' return value for non
 	 * existent mappings in the topology map.
 	 */
-	return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL;
+	return rapl_pmu_idx < rapl_pmus_pkg->nr_rapl_pmu ?
+	       rapl_pmus_pkg->rapl_pmu[rapl_pmu_idx] : NULL;
 }
 
 static inline u64 rapl_read_counter(struct perf_event *event)
@@ -160,7 +196,7 @@ static inline u64 rapl_read_counter(struct perf_event *event)
 
 static inline u64 rapl_scale(u64 v, int cfg)
 {
-	if (cfg > NR_RAPL_DOMAINS) {
+	if (cfg > NR_RAPL_PKG_DOMAINS) {
 		pr_warn("Invalid domain %d, failed to scale data\n", cfg);
 		return v;
 	}
@@ -212,34 +248,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu)
 
 static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
 {
-	struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
+	struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
 	struct perf_event *event;
 	unsigned long flags;
 
-	if (!pmu->n_active)
+	if (!rapl_pmu->n_active)
 		return HRTIMER_NORESTART;
 
-	raw_spin_lock_irqsave(&pmu->lock, flags);
+	raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
 
-	list_for_each_entry(event, &pmu->active_list, active_entry)
+	list_for_each_entry(event, &rapl_pmu->active_list, active_entry)
 		rapl_event_update(event);
 
-	raw_spin_unlock_irqrestore(&pmu->lock, flags);
+	raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
 
-	hrtimer_forward_now(hrtimer, pmu->timer_interval);
+	hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval);
 
 	return HRTIMER_RESTART;
 }
 
-static void rapl_hrtimer_init(struct rapl_pmu *pmu)
+static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu)
 {
-	struct hrtimer *hr = &pmu->hrtimer;
+	struct hrtimer *hr = &rapl_pmu->hrtimer;
 
 	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hr->function = rapl_hrtimer_handle;
 }
 
-static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
+static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu,
 				   struct perf_event *event)
 {
 	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
@@ -247,39 +283,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
 
 	event->hw.state = 0;
 
-	list_add_tail(&event->active_entry, &pmu->active_list);
+	list_add_tail(&event->active_entry, &rapl_pmu->active_list);
 
 	local64_set(&event->hw.prev_count, rapl_read_counter(event));
 
-	pmu->n_active++;
-	if (pmu->n_active == 1)
-		rapl_start_hrtimer(pmu);
+	rapl_pmu->n_active++;
+	if (rapl_pmu->n_active == 1)
+		rapl_start_hrtimer(rapl_pmu);
 }
 
 static void rapl_pmu_event_start(struct perf_event *event, int mode)
 {
-	struct rapl_pmu *pmu = event->pmu_private;
+	struct rapl_pmu *rapl_pmu = event->pmu_private;
 	unsigned long flags;
 
-	raw_spin_lock_irqsave(&pmu->lock, flags);
-	__rapl_pmu_event_start(pmu, event);
-	raw_spin_unlock_irqrestore(&pmu->lock, flags);
+	raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
+	__rapl_pmu_event_start(rapl_pmu, event);
+	raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
 }
 
 static void rapl_pmu_event_stop(struct perf_event *event, int mode)
 {
-	struct rapl_pmu *pmu = event->pmu_private;
+	struct rapl_pmu *rapl_pmu = event->pmu_private;
 	struct hw_perf_event *hwc = &event->hw;
 	unsigned long flags;
 
-	raw_spin_lock_irqsave(&pmu->lock, flags);
+	raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
 
 	/* mark event as deactivated and stopped */
 	if (!(hwc->state & PERF_HES_STOPPED)) {
-		WARN_ON_ONCE(pmu->n_active <= 0);
-		pmu->n_active--;
-		if (pmu->n_active == 0)
-			hrtimer_cancel(&pmu->hrtimer);
+		WARN_ON_ONCE(rapl_pmu->n_active <= 0);
+		rapl_pmu->n_active--;
+		if (rapl_pmu->n_active == 0)
+			hrtimer_cancel(&rapl_pmu->hrtimer);
 
 		list_del(&event->active_entry);
 
@@ -297,23 +333,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode)
 		hwc->state |= PERF_HES_UPTODATE;
 	}
 
-	raw_spin_unlock_irqrestore(&pmu->lock, flags);
+	raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
 }
 
 static int rapl_pmu_event_add(struct perf_event *event, int mode)
 {
-	struct rapl_pmu *pmu = event->pmu_private;
+	struct rapl_pmu *rapl_pmu = event->pmu_private;
 	struct hw_perf_event *hwc = &event->hw;
 	unsigned long flags;
 
-	raw_spin_lock_irqsave(&pmu->lock, flags);
+	raw_spin_lock_irqsave(&rapl_pmu->lock, flags);
 
 	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
 
 	if (mode & PERF_EF_START)
-		__rapl_pmu_event_start(pmu, event);
+		__rapl_pmu_event_start(rapl_pmu, event);
 
-	raw_spin_unlock_irqrestore(&pmu->lock, flags);
+	raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags);
 
 	return 0;
 }
@@ -327,10 +363,14 @@ static int rapl_pmu_event_init(struct perf_event *event)
 {
 	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
 	int bit, ret = 0;
-	struct rapl_pmu *pmu;
+	struct rapl_pmu *rapl_pmu;
+	struct rapl_pmus *curr_rapl_pmus;
 
 	/* only look at RAPL events */
-	if (event->attr.type != rapl_pmus->pmu.type)
+	if (event->attr.type == rapl_pmus_pkg->pmu.type ||
+		(rapl_pmus_core && event->attr.type == rapl_pmus_core->pmu.type))
+		curr_rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu);
+	else
 		return -ENOENT;
 
 	/* check only supported bits are set */
@@ -340,16 +380,18 @@ static int rapl_pmu_event_init(struct perf_event *event)
 	if (event->cpu < 0)
 		return -EINVAL;
 
-	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+	if (curr_rapl_pmus == rapl_pmus_pkg)
+		event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
 
-	if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
+	if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1)
 		return -EINVAL;
 
-	cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
+	cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1);
 	bit = cfg - 1;
 
 	/* check event supported */
-	if (!(rapl_cntr_mask & (1 << bit)))
+	if (!(rapl_pkg_cntr_mask & (1 << bit)) &&
+	    !(rapl_core_cntr_mask & (1 << bit)))
 		return -EINVAL;
 
 	/* unsupported modes and filters */
@@ -357,12 +399,18 @@ static int rapl_pmu_event_init(struct perf_event *event)
 		return -EINVAL;
 
 	/* must be done before validate_group */
-	pmu = cpu_to_rapl_pmu(event->cpu);
-	if (!pmu)
+	if (curr_rapl_pmus == rapl_pmus_core) {
+		rapl_pmu = curr_rapl_pmus->rapl_pmu[topology_logical_core_id(event->cpu)];
+		event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr;
+	} else {
+		rapl_pmu = curr_rapl_pmus->rapl_pmu[get_rapl_pmu_idx(event->cpu)];
+		event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr;
+	}
+
+	if (!rapl_pmu)
 		return -EINVAL;
-	event->cpu = pmu->cpu;
-	event->pmu_private = pmu;
-	event->hw.event_base = rapl_msrs[bit].msr;
+	event->cpu = rapl_pmu->cpu;
+	event->pmu_private = rapl_pmu;
 	event->hw.config = cfg;
 	event->hw.idx = bit;
 
@@ -377,7 +425,7 @@ static void rapl_pmu_event_read(struct perf_event *event)
 static ssize_t rapl_get_attr_cpumask(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
-	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
+	return cpumap_print_to_pagebuf(true, buf, &rapl_pmus_pkg->cpumask);
 }
 
 static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
@@ -391,17 +439,38 @@ static struct attribute_group rapl_pmu_attr_group = {
 	.attrs = rapl_pmu_attrs,
 };
 
+static ssize_t rapl_get_attr_per_core_cpumask(struct device *dev,
+					     struct device_attribute *attr, char *buf)
+{
+	return cpumap_print_to_pagebuf(true, buf, &rapl_pmus_core->cpumask);
+}
+
+static struct device_attribute dev_attr_per_core_cpumask = __ATTR(cpumask, 0444,
+								 rapl_get_attr_per_core_cpumask,
+								 NULL);
+
+static struct attribute *rapl_pmu_per_core_attrs[] = {
+	&dev_attr_per_core_cpumask.attr,
+	NULL,
+};
+
+static struct attribute_group rapl_pmu_per_core_attr_group = {
+	.attrs = rapl_pmu_per_core_attrs,
+};
+
 RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
 RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
 RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
 RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
 RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");
+RAPL_EVENT_ATTR_STR(energy-per-core,   rapl_per_core, "event=0x01");
 
 RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
 RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
 RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
 RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
 RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");
+RAPL_EVENT_ATTR_STR(energy-per-core.unit,   rapl_per_core_unit, "Joules");
 
 /*
  * we compute in 0.23 nJ increments regardless of MSR
@@ -411,6 +480,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890
 RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
 RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
 RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-per-core.scale,   rapl_per_core_scale, "2.3283064365386962890625e-10");
 
 /*
  * There are no default events, but we need to create
@@ -444,6 +514,13 @@ static const struct attribute_group *rapl_attr_groups[] = {
 	NULL,
 };
 
+static const struct attribute_group *rapl_per_core_attr_groups[] = {
+	&rapl_pmu_per_core_attr_group,
+	&rapl_pmu_format_group,
+	&rapl_pmu_events_group,
+	NULL,
+};
+
 static struct attribute *rapl_events_cores[] = {
 	EVENT_PTR(rapl_cores),
 	EVENT_PTR(rapl_cores_unit),
@@ -504,6 +581,18 @@ static struct attribute_group rapl_events_psys_group = {
 	.attrs = rapl_events_psys,
 };
 
+static struct attribute *rapl_events_per_core[] = {
+	EVENT_PTR(rapl_per_core),
+	EVENT_PTR(rapl_per_core_unit),
+	EVENT_PTR(rapl_per_core_scale),
+	NULL,
+};
+
+static struct attribute_group rapl_events_per_core_group = {
+	.name  = "events",
+	.attrs = rapl_events_per_core,
+};
+
 static bool test_msr(int idx, void *data)
 {
 	return test_bit(idx, (unsigned long *) data);
@@ -529,11 +618,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
 };
 
 /*
- * Force to PERF_RAPL_MAX size due to:
- * - perf_msr_probe(PERF_RAPL_MAX)
+ * Force to PERF_RAPL_PKG_EVENTS_MAX size due to:
+ * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX)
  * - want to use same event codes across both architectures
  */
-static struct perf_msr amd_rapl_msrs[] = {
+static struct perf_msr amd_rapl_pkg_msrs[] = {
 	[PERF_RAPL_PP0]  = { 0, &rapl_events_cores_group, NULL, false, 0 },
 	[PERF_RAPL_PKG]  = { MSR_AMD_PKG_ENERGY_STATUS,  &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
 	[PERF_RAPL_RAM]  = { 0, &rapl_events_ram_group,   NULL, false, 0 },
@@ -541,72 +630,104 @@ static struct perf_msr amd_rapl_msrs[] = {
 	[PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group,  NULL, false, 0 },
 };
 
-static int rapl_cpu_offline(unsigned int cpu)
+static struct perf_msr amd_rapl_core_msrs[] = {
+	[PERF_RAPL_PER_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_per_core_group,
+				 test_msr, false, RAPL_MSR_MASK },
+};
+
+static int __rapl_cpu_offline(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu_idx,
+			      const struct cpumask *event_cpumask, unsigned int cpu)
 {
-	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+	struct rapl_pmu *rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
 	int target;
 
 	/* Check if exiting cpu is used for collecting rapl events */
-	if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
+	if (!cpumask_test_and_clear_cpu(cpu, &rapl_pmus->cpumask))
 		return 0;
 
-	pmu->cpu = -1;
+	rapl_pmu->cpu = -1;
 	/* Find a new cpu to collect rapl events */
-	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
+	target = cpumask_any_but(event_cpumask, cpu);
 
 	/* Migrate rapl events to the new target */
 	if (target < nr_cpu_ids) {
-		cpumask_set_cpu(target, &rapl_cpu_mask);
-		pmu->cpu = target;
-		perf_pmu_migrate_context(pmu->pmu, cpu, target);
+		cpumask_set_cpu(target, &rapl_pmus->cpumask);
+		rapl_pmu->cpu = target;
+		perf_pmu_migrate_context(rapl_pmu->pmu, cpu, target);
 	}
 	return 0;
 }
 
-static int rapl_cpu_online(unsigned int cpu)
+static int rapl_cpu_offline(unsigned int cpu)
+{
+	int ret =  __rapl_cpu_offline(rapl_pmus_pkg, get_rapl_pmu_idx(cpu),
+				  get_rapl_pmu_cpumask(cpu), cpu);
+
+	if (ret == 0 && rapl_model->core_events)
+		ret = __rapl_cpu_offline(rapl_pmus_core, topology_logical_core_id(cpu),
+				   topology_sibling_cpumask(cpu), cpu);
+
+	return ret;
+}
+
+static int __rapl_cpu_online(struct rapl_pmus *rapl_pmus, unsigned int rapl_pmu_idx,
+			     const struct cpumask *event_cpumask, unsigned int cpu)
 {
-	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
+	struct rapl_pmu *rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx];
 	int target;
 
-	if (!pmu) {
-		pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
-		if (!pmu)
+	if (!rapl_pmu) {
+		rapl_pmu = kzalloc_node(sizeof(*rapl_pmu), GFP_KERNEL, cpu_to_node(cpu));
+		if (!rapl_pmu)
 			return -ENOMEM;
 
-		raw_spin_lock_init(&pmu->lock);
-		INIT_LIST_HEAD(&pmu->active_list);
-		pmu->pmu = &rapl_pmus->pmu;
-		pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
-		rapl_hrtimer_init(pmu);
+		raw_spin_lock_init(&rapl_pmu->lock);
+		INIT_LIST_HEAD(&rapl_pmu->active_list);
+		rapl_pmu->pmu = &rapl_pmus->pmu;
+		rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
+		rapl_hrtimer_init(rapl_pmu);
 
-		rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
+		rapl_pmus->rapl_pmu[rapl_pmu_idx] = rapl_pmu;
 	}
 
 	/*
 	 * Check if there is an online cpu in the package which collects rapl
 	 * events already.
 	 */
-	target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
+	target = cpumask_any_and(&rapl_pmus->cpumask, event_cpumask);
 	if (target < nr_cpu_ids)
 		return 0;
 
-	cpumask_set_cpu(cpu, &rapl_cpu_mask);
-	pmu->cpu = cpu;
+	cpumask_set_cpu(cpu, &rapl_pmus->cpumask);
+	rapl_pmu->cpu = cpu;
 	return 0;
 }
 
-static int rapl_check_hw_unit(struct rapl_model *rm)
+static int rapl_cpu_online(unsigned int cpu)
+{
+	int ret =  __rapl_cpu_online(rapl_pmus_pkg, get_rapl_pmu_idx(cpu),
+				 get_rapl_pmu_cpumask(cpu), cpu);
+
+	if (ret == 0 && rapl_model->core_events)
+		ret = __rapl_cpu_online(rapl_pmus_core, topology_logical_core_id(cpu),
+				   topology_sibling_cpumask(cpu), cpu);
+
+	return ret;
+}
+
+
+static int rapl_check_hw_unit(void)
 {
 	u64 msr_rapl_power_unit_bits;
 	int i;
 
 	/* protect rdmsrl() to handle virtualization */
-	if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
+	if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits))
 		return -1;
-	for (i = 0; i < NR_RAPL_DOMAINS; i++)
+	for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++)
 		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
 
-	switch (rm->unit_quirk) {
+	switch (rapl_model->unit_quirk) {
 	/*
 	 * DRAM domain on HSW server and KNL has fixed energy unit which can be
 	 * different than the unit from power unit MSR. See
@@ -645,22 +766,29 @@ static void __init rapl_advertise(void)
 	int i;
 
 	pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
-		hweight32(rapl_cntr_mask), rapl_timer_ms);
+		hweight32(rapl_pkg_cntr_mask) + hweight32(rapl_core_cntr_mask), rapl_timer_ms);
 
-	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
-		if (rapl_cntr_mask & (1 << i)) {
+	for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) {
+		if (rapl_pkg_cntr_mask & (1 << i)) {
 			pr_info("hw unit of domain %s 2^-%d Joules\n",
-				rapl_domain_names[i], rapl_hw_unit[i]);
+				rapl_pkg_domain_names[i], rapl_hw_unit[i]);
+		}
+	}
+
+	for (i = 0; i < NR_RAPL_CORE_DOMAINS; i++) {
+		if (rapl_core_cntr_mask & (1 << i)) {
+			pr_info("hw unit of domain %s 2^-%d Joules\n",
+				rapl_core_domain_names[i], rapl_hw_unit[i]);
 		}
 	}
 }
 
-static void cleanup_rapl_pmus(void)
+static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus)
 {
 	int i;
 
 	for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++)
-		kfree(rapl_pmus->pmus[i]);
+		kfree(rapl_pmus->rapl_pmu[i]);
 	kfree(rapl_pmus);
 }
 
@@ -673,11 +801,17 @@ static const struct attribute_group *rapl_attr_update[] = {
 	NULL,
 };
 
-static int __init init_rapl_pmus(void)
+static const struct attribute_group *rapl_per_core_attr_update[] = {
+	&rapl_events_per_core_group,
+};
+
+static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int nr_rapl_pmu,
+				 const struct attribute_group **rapl_attr_groups,
+				 const struct attribute_group **rapl_attr_update)
 {
-	int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
+	struct rapl_pmus *rapl_pmus;
 
-	rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL);
+	rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL);
 	if (!rapl_pmus)
 		return -ENOMEM;
 
@@ -693,75 +827,80 @@ static int __init init_rapl_pmus(void)
 	rapl_pmus->pmu.read		= rapl_pmu_event_read;
 	rapl_pmus->pmu.module		= THIS_MODULE;
 	rapl_pmus->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
+
+	*rapl_pmus_ptr = rapl_pmus;
+
 	return 0;
 }
 
 static struct rapl_model model_snb = {
-	.events		= BIT(PERF_RAPL_PP0) |
+	.pkg_events	= BIT(PERF_RAPL_PP0) |
 			  BIT(PERF_RAPL_PKG) |
 			  BIT(PERF_RAPL_PP1),
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_msrs,
+	.rapl_pkg_msrs	= intel_rapl_msrs,
 };
 
 static struct rapl_model model_snbep = {
-	.events		= BIT(PERF_RAPL_PP0) |
+	.pkg_events	= BIT(PERF_RAPL_PP0) |
 			  BIT(PERF_RAPL_PKG) |
 			  BIT(PERF_RAPL_RAM),
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_msrs,
+	.rapl_pkg_msrs	= intel_rapl_msrs,
 };
 
 static struct rapl_model model_hsw = {
-	.events		= BIT(PERF_RAPL_PP0) |
+	.pkg_events	= BIT(PERF_RAPL_PP0) |
 			  BIT(PERF_RAPL_PKG) |
 			  BIT(PERF_RAPL_RAM) |
 			  BIT(PERF_RAPL_PP1),
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_msrs,
+	.rapl_pkg_msrs	= intel_rapl_msrs,
 };
 
 static struct rapl_model model_hsx = {
-	.events		= BIT(PERF_RAPL_PP0) |
+	.pkg_events	= BIT(PERF_RAPL_PP0) |
 			  BIT(PERF_RAPL_PKG) |
 			  BIT(PERF_RAPL_RAM),
 	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_msrs,
+	.rapl_pkg_msrs	= intel_rapl_msrs,
 };
 
 static struct rapl_model model_knl = {
-	.events		= BIT(PERF_RAPL_PKG) |
+	.pkg_events	= BIT(PERF_RAPL_PKG) |
 			  BIT(PERF_RAPL_RAM),
 	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_HSW,
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_msrs,
+	.rapl_pkg_msrs	= intel_rapl_msrs,
 };
 
 static struct rapl_model model_skl = {
-	.events		= BIT(PERF_RAPL_PP0) |
+	.pkg_events	= BIT(PERF_RAPL_PP0) |
 			  BIT(PERF_RAPL_PKG) |
 			  BIT(PERF_RAPL_RAM) |
 			  BIT(PERF_RAPL_PP1) |
 			  BIT(PERF_RAPL_PSYS),
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_msrs,
+	.rapl_pkg_msrs	= intel_rapl_msrs,
 };
 
 static struct rapl_model model_spr = {
-	.events		= BIT(PERF_RAPL_PP0) |
+	.pkg_events	= BIT(PERF_RAPL_PP0) |
 			  BIT(PERF_RAPL_PKG) |
 			  BIT(PERF_RAPL_RAM) |
 			  BIT(PERF_RAPL_PSYS),
 	.unit_quirk	= RAPL_UNIT_QUIRK_INTEL_SPR,
 	.msr_power_unit = MSR_RAPL_POWER_UNIT,
-	.rapl_msrs      = intel_rapl_spr_msrs,
+	.rapl_pkg_msrs	= intel_rapl_spr_msrs,
 };
 
 static struct rapl_model model_amd_hygon = {
-	.events		= BIT(PERF_RAPL_PKG),
+	.pkg_events	= BIT(PERF_RAPL_PKG),
+	.core_events	= BIT(PERF_RAPL_PER_CORE),
 	.msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
-	.rapl_msrs      = amd_rapl_msrs,
+	.rapl_pkg_msrs	= amd_rapl_pkg_msrs,
+	.rapl_core_msrs	= amd_rapl_core_msrs,
 };
 
 static const struct x86_cpu_id rapl_model_match[] __initconst = {
@@ -817,28 +956,47 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
 static int __init rapl_pmu_init(void)
 {
 	const struct x86_cpu_id *id;
-	struct rapl_model *rm;
 	int ret;
+	int nr_rapl_pmu = topology_max_packages() * topology_max_dies_per_package();
+	int nr_cores = topology_max_packages() * topology_num_cores_per_package();
+
+	if (rapl_pmu_is_pkg_scope())
+		nr_rapl_pmu = topology_max_packages();
 
 	id = x86_match_cpu(rapl_model_match);
 	if (!id)
 		return -ENODEV;
 
-	rm = (struct rapl_model *) id->driver_data;
-
-	rapl_msrs = rm->rapl_msrs;
+	rapl_model = (struct rapl_model *) id->driver_data;
 
-	rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
-					false, (void *) &rm->events);
+	rapl_pkg_cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, PERF_RAPL_PKG_EVENTS_MAX,
+					false, (void *) &rapl_model->pkg_events);
 
-	ret = rapl_check_hw_unit(rm);
+	ret = rapl_check_hw_unit();
 	if (ret)
 		return ret;
 
-	ret = init_rapl_pmus();
+	ret = init_rapl_pmus(&rapl_pmus_pkg, nr_rapl_pmu, rapl_attr_groups, rapl_attr_update);
 	if (ret)
 		return ret;
 
+	if (rapl_model->core_events) {
+		rapl_core_cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs,
+						     PERF_RAPL_CORE_EVENTS_MAX, false,
+						     (void *) &rapl_model->core_events);
+
+		ret = init_rapl_pmus(&rapl_pmus_core, nr_cores,
+				     rapl_per_core_attr_groups, rapl_per_core_attr_update);
+		if (ret) {
+			/*
+			 * If initialization of per_core PMU fails, reset per_core
+			 * flag, and continue with power PMU initialization.
+			 */
+			pr_warn("Per-core PMU initialization failed (%d)\n", ret);
+			rapl_model->core_events = 0UL;
+		}
+	}
+
 	/*
 	 * Install callbacks. Core will call them for each online cpu.
 	 */
@@ -848,10 +1006,24 @@ static int __init rapl_pmu_init(void)
 	if (ret)
 		goto out;
 
-	ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
+	ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1);
 	if (ret)
 		goto out1;
 
+	if (rapl_model->core_events) {
+		ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_per_core", -1);
+		if (ret) {
+			/*
+			 * If registration of per_core PMU fails, cleanup per_core PMU
+			 * variables, reset the per_core flag and keep the
+			 * power PMU untouched.
+			 */
+			pr_warn("Per-core PMU registration failed (%d)\n", ret);
+			cleanup_rapl_pmus(rapl_pmus_core);
+			rapl_model->core_events = 0UL;
+		}
+	}
+
 	rapl_advertise();
 	return 0;
 
@@ -859,7 +1031,7 @@ static int __init rapl_pmu_init(void)
 	cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
 out:
 	pr_warn("Initialization failed (%d), disabled\n", ret);
-	cleanup_rapl_pmus();
+	cleanup_rapl_pmus(rapl_pmus_pkg);
 	return ret;
 }
 module_init(rapl_pmu_init);
@@ -867,7 +1039,11 @@ module_init(rapl_pmu_init);
 static void __exit intel_rapl_exit(void)
 {
 	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
-	perf_pmu_unregister(&rapl_pmus->pmu);
-	cleanup_rapl_pmus();
+	perf_pmu_unregister(&rapl_pmus_pkg->pmu);
+	cleanup_rapl_pmus(rapl_pmus_pkg);
+	if (rapl_model->core_events) {
+		perf_pmu_unregister(&rapl_pmus_core->pmu);
+		cleanup_rapl_pmus(rapl_pmus_core);
+	}
 }
 module_exit(intel_rapl_exit);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cb4f6c513c48..1ffe4260bef6 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -98,6 +98,7 @@ struct cpuinfo_topology {
 	// Logical ID mappings
 	u32			logical_pkg_id;
 	u32			logical_die_id;
+	u32			logical_core_id;
 
 	// AMD Node ID and Nodes per Package info
 	u32			amd_node_id;
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index aef70336d624..672fccf9f845 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -137,6 +137,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu);
 #define topology_logical_package_id(cpu)	(cpu_data(cpu).topo.logical_pkg_id)
 #define topology_physical_package_id(cpu)	(cpu_data(cpu).topo.pkg_id)
 #define topology_logical_die_id(cpu)		(cpu_data(cpu).topo.logical_die_id)
+#define topology_logical_core_id(cpu)		(cpu_data(cpu).topo.logical_core_id)
 #define topology_die_id(cpu)			(cpu_data(cpu).topo.die_id)
 #define topology_core_id(cpu)			(cpu_data(cpu).topo.core_id)
 #define topology_ppin(cpu)			(cpu_data(cpu).ppin)
diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c
index 3baf3e435834..b1eb6d7828db 100644
--- a/arch/x86/kernel/cpu/debugfs.c
+++ b/arch/x86/kernel/cpu/debugfs.c
@@ -24,6 +24,7 @@ static int cpu_debug_show(struct seq_file *m, void *p)
 	seq_printf(m, "core_id:             %u\n", c->topo.core_id);
 	seq_printf(m, "logical_pkg_id:      %u\n", c->topo.logical_pkg_id);
 	seq_printf(m, "logical_die_id:      %u\n", c->topo.logical_die_id);
+	seq_printf(m, "logical_core_id:     %u\n", c->topo.logical_core_id);
 	seq_printf(m, "llc_id:              %u\n", c->topo.llc_id);
 	seq_printf(m, "l2c_id:              %u\n", c->topo.l2c_id);
 	seq_printf(m, "amd_node_id:         %u\n", c->topo.amd_node_id);
diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c
index 9a6069e7133c..23722aa21e2f 100644
--- a/arch/x86/kernel/cpu/topology_common.c
+++ b/arch/x86/kernel/cpu/topology_common.c
@@ -151,6 +151,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early)
 	if (!early) {
 		c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN);
 		c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN);
+		c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN);
 	}
 
 	/* Package relative core ID */
-- 
2.46.0

From dc94473290be40f81a6153d3ab318e7d1f213d15 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Thu, 12 Sep 2024 11:47:36 +0200
Subject: [PATCH 11/12] t2

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 .../ABI/testing/sysfs-driver-hid-appletb-kbd  |   13 +
 Documentation/core-api/printk-formats.rst     |   32 +
 MAINTAINERS                                   |   12 +
 drivers/acpi/video_detect.c                   |   16 +
 drivers/firmware/efi/libstub/Makefile         |    2 +-
 drivers/firmware/efi/libstub/arm64.c          |    3 +-
 drivers/firmware/efi/libstub/efistub.h        |    9 +-
 drivers/firmware/efi/libstub/smbios.c         |   43 +-
 drivers/firmware/efi/libstub/x86-stub.c       |   71 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       |    3 +
 drivers/gpu/drm/drm_format_helper.c           |   54 +
 drivers/gpu/drm/i915/display/intel_ddi.c      |    4 +
 drivers/gpu/drm/i915/display/intel_fbdev.c    |    6 +-
 drivers/gpu/drm/i915/display/intel_quirks.c   |   15 +
 drivers/gpu/drm/i915/display/intel_quirks.h   |    1 +
 .../gpu/drm/tests/drm_format_helper_test.c    |   81 ++
 drivers/gpu/drm/tiny/Kconfig                  |   12 +
 drivers/gpu/drm/tiny/Makefile                 |    1 +
 drivers/gpu/drm/tiny/appletbdrm.c             |  624 +++++++++
 drivers/gpu/vga/vga_switcheroo.c              |    7 +-
 drivers/hid/Kconfig                           |   22 +
 drivers/hid/Makefile                          |    2 +
 drivers/hid/hid-apple.c                       |   87 ++
 drivers/hid/hid-appletb-bl.c                  |  206 +++
 drivers/hid/hid-appletb-kbd.c                 |  432 +++++++
 drivers/hid/hid-core.c                        |   25 +
 drivers/hid/hid-google-hammer.c               |   27 +-
 drivers/hid/hid-multitouch.c                  |   60 +-
 drivers/hid/hid-quirks.c                      |    8 +-
 drivers/hwmon/applesmc.c                      | 1138 ++++++++++++-----
 drivers/input/mouse/bcm5974.c                 |  138 ++
 .../broadcom/brcm80211/brcmfmac/cfg80211.c    |   13 +-
 drivers/pci/vgaarb.c                          |    1 +
 drivers/platform/x86/apple-gmux.c             |   18 +
 drivers/staging/Kconfig                       |    2 +
 drivers/staging/Makefile                      |    1 +
 drivers/staging/apple-bce/Kconfig             |   18 +
 drivers/staging/apple-bce/Makefile            |   28 +
 drivers/staging/apple-bce/apple_bce.c         |  445 +++++++
 drivers/staging/apple-bce/apple_bce.h         |   38 +
 drivers/staging/apple-bce/audio/audio.c       |  711 ++++++++++
 drivers/staging/apple-bce/audio/audio.h       |  125 ++
 drivers/staging/apple-bce/audio/description.h |   42 +
 drivers/staging/apple-bce/audio/pcm.c         |  308 +++++
 drivers/staging/apple-bce/audio/pcm.h         |   16 +
 drivers/staging/apple-bce/audio/protocol.c    |  347 +++++
 drivers/staging/apple-bce/audio/protocol.h    |  147 +++
 .../staging/apple-bce/audio/protocol_bce.c    |  226 ++++
 .../staging/apple-bce/audio/protocol_bce.h    |   72 ++
 drivers/staging/apple-bce/mailbox.c           |  151 +++
 drivers/staging/apple-bce/mailbox.h           |   53 +
 drivers/staging/apple-bce/queue.c             |  390 ++++++
 drivers/staging/apple-bce/queue.h             |  177 +++
 drivers/staging/apple-bce/queue_dma.c         |  220 ++++
 drivers/staging/apple-bce/queue_dma.h         |   50 +
 drivers/staging/apple-bce/vhci/command.h      |  204 +++
 drivers/staging/apple-bce/vhci/queue.c        |  268 ++++
 drivers/staging/apple-bce/vhci/queue.h        |   76 ++
 drivers/staging/apple-bce/vhci/transfer.c     |  661 ++++++++++
 drivers/staging/apple-bce/vhci/transfer.h     |   73 ++
 drivers/staging/apple-bce/vhci/vhci.c         |  759 +++++++++++
 drivers/staging/apple-bce/vhci/vhci.h         |   52 +
 drivers/usb/core/driver.c                     |   14 +
 drivers/usb/storage/uas.c                     |    5 +-
 include/drm/drm_format_helper.h               |    3 +
 include/linux/efi.h                           |    5 +-
 include/linux/hid.h                           |    2 +
 include/linux/usb.h                           |    3 +
 lib/test_printf.c                             |   20 +-
 lib/vsprintf.c                                |   36 +-
 scripts/checkpatch.pl                         |    2 +-
 71 files changed, 8533 insertions(+), 403 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd
 create mode 100644 drivers/gpu/drm/tiny/appletbdrm.c
 create mode 100644 drivers/hid/hid-appletb-bl.c
 create mode 100644 drivers/hid/hid-appletb-kbd.c
 create mode 100644 drivers/staging/apple-bce/Kconfig
 create mode 100644 drivers/staging/apple-bce/Makefile
 create mode 100644 drivers/staging/apple-bce/apple_bce.c
 create mode 100644 drivers/staging/apple-bce/apple_bce.h
 create mode 100644 drivers/staging/apple-bce/audio/audio.c
 create mode 100644 drivers/staging/apple-bce/audio/audio.h
 create mode 100644 drivers/staging/apple-bce/audio/description.h
 create mode 100644 drivers/staging/apple-bce/audio/pcm.c
 create mode 100644 drivers/staging/apple-bce/audio/pcm.h
 create mode 100644 drivers/staging/apple-bce/audio/protocol.c
 create mode 100644 drivers/staging/apple-bce/audio/protocol.h
 create mode 100644 drivers/staging/apple-bce/audio/protocol_bce.c
 create mode 100644 drivers/staging/apple-bce/audio/protocol_bce.h
 create mode 100644 drivers/staging/apple-bce/mailbox.c
 create mode 100644 drivers/staging/apple-bce/mailbox.h
 create mode 100644 drivers/staging/apple-bce/queue.c
 create mode 100644 drivers/staging/apple-bce/queue.h
 create mode 100644 drivers/staging/apple-bce/queue_dma.c
 create mode 100644 drivers/staging/apple-bce/queue_dma.h
 create mode 100644 drivers/staging/apple-bce/vhci/command.h
 create mode 100644 drivers/staging/apple-bce/vhci/queue.c
 create mode 100644 drivers/staging/apple-bce/vhci/queue.h
 create mode 100644 drivers/staging/apple-bce/vhci/transfer.c
 create mode 100644 drivers/staging/apple-bce/vhci/transfer.h
 create mode 100644 drivers/staging/apple-bce/vhci/vhci.c
 create mode 100644 drivers/staging/apple-bce/vhci/vhci.h

diff --git a/Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd b/Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd
new file mode 100644
index 000000000000..2a19584d091e
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-hid-appletb-kbd
@@ -0,0 +1,13 @@
+What:		/sys/bus/hid/drivers/hid-appletb-kbd/<dev>/mode
+Date:		September, 2023
+KernelVersion:	6.5
+Contact:	linux-input@vger.kernel.org
+Description:
+		The set of keys displayed on the Touch Bar.
+		Valid values are:
+		== =================
+		0  Escape key only
+		1  Function keys
+		2  Media/brightness keys
+		3  None
+		== =================
diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst
index 4451ef501936..c726a846f752 100644
--- a/Documentation/core-api/printk-formats.rst
+++ b/Documentation/core-api/printk-formats.rst
@@ -632,6 +632,38 @@ Examples::
 	%p4cc	Y10  little-endian (0x20303159)
 	%p4cc	NV12 big-endian (0xb231564e)
 
+Generic FourCC code
+-------------------
+
+::
+	%p4c[hnbl]	gP00 (0x67503030)
+
+Print a generic FourCC code, as both ASCII characters and its numerical
+value as hexadecimal.
+
+The additional ``h``, ``r``, ``b``, and ``l`` specifiers are used to specify
+host, reversed, big or little endian order data respectively. Host endian
+order means the data is interpreted as a 32-bit integer and the most
+significant byte is printed first; that is, the character code as printed
+matches the byte order stored in memory on big-endian systems, and is reversed
+on little-endian systems.
+
+Passed by reference.
+
+Examples for a little-endian machine, given &(u32)0x67503030::
+
+	%p4ch	gP00 (0x67503030)
+	%p4cl	gP00 (0x67503030)
+	%p4cb	00Pg (0x30305067)
+	%p4cr	00Pg (0x30305067)
+
+Examples for a big-endian machine, given &(u32)0x67503030::
+
+	%p4ch	gP00 (0x67503030)
+	%p4cl	00Pg (0x30305067)
+	%p4cb	gP00 (0x67503030)
+	%p4cr	00Pg (0x30305067)
+
 Rust
 ----
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 4112729fc23a..064156d69e75 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6728,6 +6728,12 @@ S:	Supported
 T:	git https://gitlab.freedesktop.org/drm/misc/kernel.git
 F:	drivers/gpu/drm/sun4i/sun8i*
 
+DRM DRIVER FOR APPLE TOUCH BARS
+M:	Kerem Karabay <kekrby@gmail.com>
+L:	dri-devel@lists.freedesktop.org
+S:	Maintained
+F:	drivers/gpu/drm/tiny/appletbdrm.c
+
 DRM DRIVER FOR ARM PL111 CLCD
 S:	Orphan
 T:	git https://gitlab.freedesktop.org/drm/misc/kernel.git
@@ -9733,6 +9739,12 @@ F:	include/linux/pm.h
 F:	include/linux/suspend.h
 F:	kernel/power/
 
+HID APPLE TOUCH BAR DRIVERS
+M:	Kerem Karabay <kekrby@gmail.com>
+L:	linux-input@vger.kernel.org
+S:	Maintained
+F:	drivers/hid/hid-appletb-*
+
 HID CORE LAYER
 M:	Jiri Kosina <jikos@kernel.org>
 M:	Benjamin Tissoires <bentiss@kernel.org>
diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c
index ff6f260433a1..674b9db7a1ef 100644
--- a/drivers/acpi/video_detect.c
+++ b/drivers/acpi/video_detect.c
@@ -541,6 +541,14 @@ static const struct dmi_system_id video_detect_dmi_table[] = {
 		DMI_MATCH(DMI_PRODUCT_NAME, "iMac12,2"),
 		},
 	},
+	{
+	 .callback = video_detect_force_native,
+	 /* Apple MacBook Air 9,1 */
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+		DMI_MATCH(DMI_PRODUCT_NAME, "MacBookAir9,1"),
+		},
+	},
 	{
 	 /* https://bugzilla.redhat.com/show_bug.cgi?id=1217249 */
 	 .callback = video_detect_force_native,
@@ -550,6 +558,14 @@ static const struct dmi_system_id video_detect_dmi_table[] = {
 		DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro12,1"),
 		},
 	},
+	{
+	 .callback = video_detect_force_native,
+	 /* Apple MacBook Pro 16,2 */
+	 .matches = {
+		DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+		DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro16,2"),
+		},
+	},
 	{
 	 .callback = video_detect_force_native,
 	 /* Dell Inspiron N4010 */
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index 06f0428a723c..1f32d6cf98d6 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -76,7 +76,7 @@ lib-$(CONFIG_EFI_GENERIC_STUB)	+= efi-stub.o string.o intrinsics.o systable.o \
 
 lib-$(CONFIG_ARM)		+= arm32-stub.o
 lib-$(CONFIG_ARM64)		+= kaslr.o arm64.o arm64-stub.o smbios.o
-lib-$(CONFIG_X86)		+= x86-stub.o
+lib-$(CONFIG_X86)		+= x86-stub.o smbios.o
 lib-$(CONFIG_X86_64)		+= x86-5lvl.o
 lib-$(CONFIG_RISCV)		+= kaslr.o riscv.o riscv-stub.o
 lib-$(CONFIG_LOONGARCH)		+= loongarch.o loongarch-stub.o
diff --git a/drivers/firmware/efi/libstub/arm64.c b/drivers/firmware/efi/libstub/arm64.c
index 446e35eaf3d9..e57cd3de0a00 100644
--- a/drivers/firmware/efi/libstub/arm64.c
+++ b/drivers/firmware/efi/libstub/arm64.c
@@ -39,8 +39,7 @@ static bool system_needs_vamap(void)
 		static char const emag[] = "eMAG";
 
 	default:
-		version = efi_get_smbios_string(&record->header, 4,
-						processor_version);
+		version = efi_get_smbios_string(record, processor_version);
 		if (!version || (strncmp(version, altra, sizeof(altra) - 1) &&
 				 strncmp(version, emag, sizeof(emag) - 1)))
 			break;
diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h
index 27abb4ce0291..d33ccbc4a2c6 100644
--- a/drivers/firmware/efi/libstub/efistub.h
+++ b/drivers/firmware/efi/libstub/efistub.h
@@ -1204,14 +1204,13 @@ struct efi_smbios_type4_record {
 	u16				thread_enabled;
 };
 
-#define efi_get_smbios_string(__record, __type, __name) ({		\
-	int off = offsetof(struct efi_smbios_type ## __type ## _record,	\
-			   __name);					\
-	__efi_get_smbios_string((__record), __type, off);		\
+#define efi_get_smbios_string(__record, __field) ({			\
+	__typeof__(__record) __rec = __record;				\
+	__efi_get_smbios_string(&__rec->header, &__rec->__field);	\
 })
 
 const u8 *__efi_get_smbios_string(const struct efi_smbios_record *record,
-				  u8 type, int offset);
+				  const u8 *offset);
 
 void efi_remap_image(unsigned long image_base, unsigned alloc_size,
 		     unsigned long code_size);
diff --git a/drivers/firmware/efi/libstub/smbios.c b/drivers/firmware/efi/libstub/smbios.c
index c217de2cc8d5..f31410d7e7e1 100644
--- a/drivers/firmware/efi/libstub/smbios.c
+++ b/drivers/firmware/efi/libstub/smbios.c
@@ -6,20 +6,31 @@
 
 #include "efistub.h"
 
-typedef struct efi_smbios_protocol efi_smbios_protocol_t;
-
-struct efi_smbios_protocol {
-	efi_status_t (__efiapi *add)(efi_smbios_protocol_t *, efi_handle_t,
-				     u16 *, struct efi_smbios_record *);
-	efi_status_t (__efiapi *update_string)(efi_smbios_protocol_t *, u16 *,
-					       unsigned long *, u8 *);
-	efi_status_t (__efiapi *remove)(efi_smbios_protocol_t *, u16);
-	efi_status_t (__efiapi *get_next)(efi_smbios_protocol_t *, u16 *, u8 *,
-					  struct efi_smbios_record **,
-					  efi_handle_t *);
-
-	u8 major_version;
-	u8 minor_version;
+typedef union efi_smbios_protocol efi_smbios_protocol_t;
+
+union efi_smbios_protocol {
+	struct {
+		efi_status_t (__efiapi *add)(efi_smbios_protocol_t *, efi_handle_t,
+					     u16 *, struct efi_smbios_record *);
+		efi_status_t (__efiapi *update_string)(efi_smbios_protocol_t *, u16 *,
+						       unsigned long *, u8 *);
+		efi_status_t (__efiapi *remove)(efi_smbios_protocol_t *, u16);
+		efi_status_t (__efiapi *get_next)(efi_smbios_protocol_t *, u16 *, u8 *,
+						  struct efi_smbios_record **,
+						  efi_handle_t *);
+
+		u8 major_version;
+		u8 minor_version;
+	};
+	struct {
+		u32 add;
+		u32 update_string;
+		u32 remove;
+		u32 get_next;
+
+		u8 major_version;
+		u8 minor_version;
+	} mixed_mode;
 };
 
 const struct efi_smbios_record *efi_get_smbios_record(u8 type)
@@ -38,7 +49,7 @@ const struct efi_smbios_record *efi_get_smbios_record(u8 type)
 }
 
 const u8 *__efi_get_smbios_string(const struct efi_smbios_record *record,
-				  u8 type, int offset)
+				  const u8 *offset)
 {
 	const u8 *strtable;
 
@@ -46,7 +57,7 @@ const u8 *__efi_get_smbios_string(const struct efi_smbios_record *record,
 		return NULL;
 
 	strtable = (u8 *)record + record->length;
-	for (int i = 1; i < ((u8 *)record)[offset]; i++) {
+	for (int i = 1; i < *offset; i++) {
 		int len = strlen(strtable);
 
 		if (!len)
diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c
index 99d39eda5134..0a2342c0bc16 100644
--- a/drivers/firmware/efi/libstub/x86-stub.c
+++ b/drivers/firmware/efi/libstub/x86-stub.c
@@ -225,6 +225,68 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params)
 	}
 }
 
+static bool apple_match_product_name(void)
+{
+	static const char type1_product_matches[][15] = {
+		"MacBookPro11,3",
+		"MacBookPro11,5",
+		"MacBookPro13,3",
+		"MacBookPro14,3",
+		"MacBookPro15,1",
+		"MacBookPro15,3",
+		"MacBookPro16,1",
+		"MacBookPro16,4",
+	};
+	const struct efi_smbios_type1_record *record;
+	const u8 *product;
+
+	record = (struct efi_smbios_type1_record *)efi_get_smbios_record(1);
+	if (!record)
+		return false;
+
+	product = efi_get_smbios_string(record, product_name);
+	if (!product)
+		return false;
+
+	for (int i = 0; i < ARRAY_SIZE(type1_product_matches); i++) {
+		if (!strcmp(product, type1_product_matches[i]))
+			return true;
+	}
+
+	return false;
+}
+
+static void apple_set_os(void)
+{
+	struct {
+		unsigned long version;
+		efi_status_t (__efiapi *set_os_version)(const char *);
+		efi_status_t (__efiapi *set_os_vendor)(const char *);
+	} *set_os;
+	efi_status_t status;
+
+	if (!efi_is_64bit() || !apple_match_product_name())
+		return;
+
+	status = efi_bs_call(locate_protocol, &APPLE_SET_OS_PROTOCOL_GUID, NULL,
+			     (void **)&set_os);
+	if (status != EFI_SUCCESS)
+		return;
+
+	if (set_os->version >= 2) {
+		status = set_os->set_os_vendor("Apple Inc.");
+		if (status != EFI_SUCCESS)
+			efi_err("Failed to set OS vendor via apple_set_os\n");
+	}
+
+	if (set_os->version > 0) {
+		/* The version being set doesn't seem to matter */
+		status = set_os->set_os_version("Mac OS X 10.9");
+		if (status != EFI_SUCCESS)
+			efi_err("Failed to set OS version via apple_set_os\n");
+	}
+}
+
 efi_status_t efi_adjust_memory_range_protection(unsigned long start,
 						unsigned long size)
 {
@@ -335,9 +397,12 @@ static const efi_char16_t apple[] = L"Apple";
 
 static void setup_quirks(struct boot_params *boot_params)
 {
-	if (IS_ENABLED(CONFIG_APPLE_PROPERTIES) &&
-	    !memcmp(efistub_fw_vendor(), apple, sizeof(apple)))
-		retrieve_apple_device_properties(boot_params);
+	if (!memcmp(efistub_fw_vendor(), apple, sizeof(apple))) {
+		if (IS_ENABLED(CONFIG_APPLE_PROPERTIES))
+			retrieve_apple_device_properties(boot_params);
+
+		apple_set_os();
+	}
 }
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index bb0b636d0d75..a05ed98da785 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2211,6 +2211,9 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
 	int ret, retry = 0, i;
 	bool supports_atomic = false;
 
+	if (vga_switcheroo_client_probe_defer(pdev))
+		return -EPROBE_DEFER;
+
 	/* skip devices which are owned by radeon */
 	for (i = 0; i < ARRAY_SIZE(amdgpu_unsupported_pciidlist); i++) {
 		if (amdgpu_unsupported_pciidlist[i] == pdev->device)
diff --git a/drivers/gpu/drm/drm_format_helper.c b/drivers/gpu/drm/drm_format_helper.c
index b1be458ed4dd..28c0e76a1e88 100644
--- a/drivers/gpu/drm/drm_format_helper.c
+++ b/drivers/gpu/drm/drm_format_helper.c
@@ -702,6 +702,57 @@ void drm_fb_xrgb8888_to_rgb888(struct iosys_map *dst, const unsigned int *dst_pi
 }
 EXPORT_SYMBOL(drm_fb_xrgb8888_to_rgb888);
 
+static void drm_fb_xrgb8888_to_bgr888_line(void *dbuf, const void *sbuf, unsigned int pixels)
+{
+	u8 *dbuf8 = dbuf;
+	const __le32 *sbuf32 = sbuf;
+	unsigned int x;
+	u32 pix;
+
+	for (x = 0; x < pixels; x++) {
+		pix = le32_to_cpu(sbuf32[x]);
+		/* write red-green-blue to output in little endianness */
+		*dbuf8++ = (pix & 0x00FF0000) >> 16;
+		*dbuf8++ = (pix & 0x0000FF00) >> 8;
+		*dbuf8++ = (pix & 0x000000FF) >> 0;
+	}
+}
+
+/**
+ * drm_fb_xrgb8888_to_bgr888 - Convert XRGB8888 to BGR888 clip buffer
+ * @dst: Array of BGR888 destination buffers
+ * @dst_pitch: Array of numbers of bytes between the start of two consecutive scanlines
+ *             within @dst; can be NULL if scanlines are stored next to each other.
+ * @src: Array of XRGB8888 source buffers
+ * @fb: DRM framebuffer
+ * @clip: Clip rectangle area to copy
+ * @state: Transform and conversion state
+ *
+ * This function copies parts of a framebuffer to display memory and converts the
+ * color format during the process. Destination and framebuffer formats must match. The
+ * parameters @dst, @dst_pitch and @src refer to arrays. Each array must have at
+ * least as many entries as there are planes in @fb's format. Each entry stores the
+ * value for the format's respective color plane at the same index.
+ *
+ * This function does not apply clipping on @dst (i.e. the destination is at the
+ * top-left corner).
+ *
+ * Drivers can use this function for BGR888 devices that don't natively
+ * support XRGB8888.
+ */
+void drm_fb_xrgb8888_to_bgr888(struct iosys_map *dst, const unsigned int *dst_pitch,
+			       const struct iosys_map *src, const struct drm_framebuffer *fb,
+			       const struct drm_rect *clip, struct drm_format_conv_state *state)
+{
+	static const u8 dst_pixsize[DRM_FORMAT_MAX_PLANES] = {
+		3,
+	};
+
+	drm_fb_xfrm(dst, dst_pitch, dst_pixsize, src, fb, clip, false, state,
+		    drm_fb_xrgb8888_to_bgr888_line);
+}
+EXPORT_SYMBOL(drm_fb_xrgb8888_to_bgr888);
+
 static void drm_fb_xrgb8888_to_argb8888_line(void *dbuf, const void *sbuf, unsigned int pixels)
 {
 	__le32 *dbuf32 = dbuf;
@@ -1035,6 +1086,9 @@ int drm_fb_blit(struct iosys_map *dst, const unsigned int *dst_pitch, uint32_t d
 		} else if (dst_format == DRM_FORMAT_RGB888) {
 			drm_fb_xrgb8888_to_rgb888(dst, dst_pitch, src, fb, clip, state);
 			return 0;
+		} else if (dst_format == DRM_FORMAT_BGR888) {
+			drm_fb_xrgb8888_to_bgr888(dst, dst_pitch, src, fb, clip, state);
+			return 0;
 		} else if (dst_format == DRM_FORMAT_ARGB8888) {
 			drm_fb_xrgb8888_to_argb8888(dst, dst_pitch, src, fb, clip, state);
 			return 0;
diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c
index 6bff169fa8d4..8d80ae00b838 100644
--- a/drivers/gpu/drm/i915/display/intel_ddi.c
+++ b/drivers/gpu/drm/i915/display/intel_ddi.c
@@ -4648,6 +4648,7 @@ intel_ddi_init_hdmi_connector(struct intel_digital_port *dig_port)
 
 static bool intel_ddi_a_force_4_lanes(struct intel_digital_port *dig_port)
 {
+	struct intel_display *display = to_intel_display(dig_port);
 	struct drm_i915_private *dev_priv = to_i915(dig_port->base.base.dev);
 
 	if (dig_port->base.port != PORT_A)
@@ -4656,6 +4657,9 @@ static bool intel_ddi_a_force_4_lanes(struct intel_digital_port *dig_port)
 	if (dig_port->saved_port_bits & DDI_A_4_LANES)
 		return false;
 
+	if (intel_has_quirk(display, QUIRK_DDI_A_FORCE_4_LANES))
+		return true;
+
 	/* Broxton/Geminilake: Bspec says that DDI_A_4_LANES is the only
 	 *                     supported configuration
 	 */
diff --git a/drivers/gpu/drm/i915/display/intel_fbdev.c b/drivers/gpu/drm/i915/display/intel_fbdev.c
index bda702c2cab8..1647e141ae78 100644
--- a/drivers/gpu/drm/i915/display/intel_fbdev.c
+++ b/drivers/gpu/drm/i915/display/intel_fbdev.c
@@ -196,10 +196,10 @@ static int intelfb_create(struct drm_fb_helper *helper,
 		return ret;
 
 	if (intel_fb &&
-	    (sizes->fb_width > intel_fb->base.width ||
-	     sizes->fb_height > intel_fb->base.height)) {
+	    (sizes->fb_width != intel_fb->base.width ||
+	     sizes->fb_height != intel_fb->base.height)) {
 		drm_dbg_kms(&dev_priv->drm,
-			    "BIOS fb too small (%dx%d), we require (%dx%d),"
+			    "BIOS fb not valid (%dx%d), we require (%dx%d),"
 			    " releasing it\n",
 			    intel_fb->base.width, intel_fb->base.height,
 			    sizes->fb_width, sizes->fb_height);
diff --git a/drivers/gpu/drm/i915/display/intel_quirks.c b/drivers/gpu/drm/i915/display/intel_quirks.c
index dfd8b4960e6d..7232f9acd0a0 100644
--- a/drivers/gpu/drm/i915/display/intel_quirks.c
+++ b/drivers/gpu/drm/i915/display/intel_quirks.c
@@ -64,6 +64,18 @@ static void quirk_increase_ddi_disabled_time(struct intel_display *display)
 	drm_info(display->drm, "Applying Increase DDI Disabled quirk\n");
 }
 
+/*
+ * In some cases, the firmware might not set the lane count to 4 (for example,
+ * when booting in some dual GPU Macs with the dGPU as the default GPU), this
+ * quirk is used to force it as otherwise it might not be possible to compute a
+ * valid link configuration.
+ */
+static void quirk_ddi_a_force_4_lanes(struct intel_display *display)
+{
+	intel_set_quirk(display, QUIRK_DDI_A_FORCE_4_LANES);
+	drm_info(display->drm, "Applying DDI A Forced 4 Lanes quirk\n");
+}
+
 static void quirk_no_pps_backlight_power_hook(struct intel_display *display)
 {
 	intel_set_quirk(display, QUIRK_NO_PPS_BACKLIGHT_POWER_HOOK);
@@ -229,6 +241,9 @@ static struct intel_quirk intel_quirks[] = {
 	{ 0x3184, 0x1019, 0xa94d, quirk_increase_ddi_disabled_time },
 	/* HP Notebook - 14-r206nv */
 	{ 0x0f31, 0x103c, 0x220f, quirk_invert_brightness },
+
+	/* Apple MacBookPro15,1 */
+	{ 0x3e9b, 0x106b, 0x0176, quirk_ddi_a_force_4_lanes },
 };
 
 static struct intel_dpcd_quirk intel_dpcd_quirks[] = {
diff --git a/drivers/gpu/drm/i915/display/intel_quirks.h b/drivers/gpu/drm/i915/display/intel_quirks.h
index cafdebda7535..58f60b870781 100644
--- a/drivers/gpu/drm/i915/display/intel_quirks.h
+++ b/drivers/gpu/drm/i915/display/intel_quirks.h
@@ -19,6 +19,7 @@ enum intel_quirk_id {
 	QUIRK_INVERT_BRIGHTNESS,
 	QUIRK_LVDS_SSC_DISABLE,
 	QUIRK_NO_PPS_BACKLIGHT_POWER_HOOK,
+	QUIRK_DDI_A_FORCE_4_LANES,
 	QUIRK_FW_SYNC_LEN,
 };
 
diff --git a/drivers/gpu/drm/tests/drm_format_helper_test.c b/drivers/gpu/drm/tests/drm_format_helper_test.c
index 08992636ec05..35cd3405d045 100644
--- a/drivers/gpu/drm/tests/drm_format_helper_test.c
+++ b/drivers/gpu/drm/tests/drm_format_helper_test.c
@@ -60,6 +60,11 @@ struct convert_to_rgb888_result {
 	const u8 expected[TEST_BUF_SIZE];
 };
 
+struct convert_to_bgr888_result {
+	unsigned int dst_pitch;
+	const u8 expected[TEST_BUF_SIZE];
+};
+
 struct convert_to_argb8888_result {
 	unsigned int dst_pitch;
 	const u32 expected[TEST_BUF_SIZE];
@@ -107,6 +112,7 @@ struct convert_xrgb8888_case {
 	struct convert_to_argb1555_result argb1555_result;
 	struct convert_to_rgba5551_result rgba5551_result;
 	struct convert_to_rgb888_result rgb888_result;
+	struct convert_to_bgr888_result bgr888_result;
 	struct convert_to_argb8888_result argb8888_result;
 	struct convert_to_xrgb2101010_result xrgb2101010_result;
 	struct convert_to_argb2101010_result argb2101010_result;
@@ -151,6 +157,10 @@ static struct convert_xrgb8888_case convert_xrgb8888_cases[] = {
 			.dst_pitch = TEST_USE_DEFAULT_PITCH,
 			.expected = { 0x00, 0x00, 0xFF },
 		},
+		.bgr888_result = {
+			.dst_pitch = TEST_USE_DEFAULT_PITCH,
+			.expected = { 0xFF, 0x00, 0x00 },
+		},
 		.argb8888_result = {
 			.dst_pitch = TEST_USE_DEFAULT_PITCH,
 			.expected = { 0xFFFF0000 },
@@ -217,6 +227,10 @@ static struct convert_xrgb8888_case convert_xrgb8888_cases[] = {
 			.dst_pitch = TEST_USE_DEFAULT_PITCH,
 			.expected = { 0x00, 0x00, 0xFF },
 		},
+		.bgr888_result = {
+			.dst_pitch = TEST_USE_DEFAULT_PITCH,
+			.expected = { 0xFF, 0x00, 0x00 },
+		},
 		.argb8888_result = {
 			.dst_pitch = TEST_USE_DEFAULT_PITCH,
 			.expected = { 0xFFFF0000 },
@@ -330,6 +344,15 @@ static struct convert_xrgb8888_case convert_xrgb8888_cases[] = {
 				0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0x00,
 			},
 		},
+		.bgr888_result = {
+			.dst_pitch = TEST_USE_DEFAULT_PITCH,
+			.expected = {
+				0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00,
+				0xFF, 0x00, 0x00, 0x00, 0xFF, 0x00,
+				0x00, 0x00, 0xFF, 0xFF, 0x00, 0xFF,
+				0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF,
+			},
+		},
 		.argb8888_result = {
 			.dst_pitch = TEST_USE_DEFAULT_PITCH,
 			.expected = {
@@ -468,6 +491,17 @@ static struct convert_xrgb8888_case convert_xrgb8888_cases[] = {
 				0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
 			},
 		},
+		.bgr888_result = {
+			.dst_pitch = 15,
+			.expected = {
+				0x0E, 0x44, 0x9C, 0x11, 0x4D, 0x05, 0xA8, 0xF3, 0x03,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0x6C, 0xF0, 0x73, 0x0E, 0x44, 0x9C, 0x11, 0x4D, 0x05,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+				0xA8, 0x03, 0x03, 0x6C, 0xF0, 0x73, 0x0E, 0x44, 0x9C,
+				0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+			},
+		},
 		.argb8888_result = {
 			.dst_pitch = 20,
 			.expected = {
@@ -914,6 +948,52 @@ static void drm_test_fb_xrgb8888_to_rgb888(struct kunit *test)
 	KUNIT_EXPECT_MEMEQ(test, buf, result->expected, dst_size);
 }
 
+static void drm_test_fb_xrgb8888_to_bgr888(struct kunit *test)
+{
+	const struct convert_xrgb8888_case *params = test->param_value;
+	const struct convert_to_bgr888_result *result = &params->bgr888_result;
+	size_t dst_size;
+	u8 *buf = NULL;
+	__le32 *xrgb8888 = NULL;
+	struct iosys_map dst, src;
+
+	struct drm_framebuffer fb = {
+		.format = drm_format_info(DRM_FORMAT_XRGB8888),
+		.pitches = { params->pitch, 0, 0 },
+	};
+
+	dst_size = conversion_buf_size(DRM_FORMAT_BGR888, result->dst_pitch,
+				       &params->clip, 0);
+	KUNIT_ASSERT_GT(test, dst_size, 0);
+
+	buf = kunit_kzalloc(test, dst_size, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buf);
+	iosys_map_set_vaddr(&dst, buf);
+
+	xrgb8888 = cpubuf_to_le32(test, params->xrgb8888, TEST_BUF_SIZE);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, xrgb8888);
+	iosys_map_set_vaddr(&src, xrgb8888);
+
+	/*
+	 * BGR888 expected results are already in little-endian
+	 * order, so there's no need to convert the test output.
+	 */
+	drm_fb_xrgb8888_to_bgr888(&dst, &result->dst_pitch, &src, &fb, &params->clip,
+				  &fmtcnv_state);
+	KUNIT_EXPECT_MEMEQ(test, buf, result->expected, dst_size);
+
+	buf = dst.vaddr; /* restore original value of buf */
+	memset(buf, 0, dst_size);
+
+	int blit_result = 0;
+
+	blit_result = drm_fb_blit(&dst, &result->dst_pitch, DRM_FORMAT_BGR888, &src, &fb, &params->clip,
+				  &fmtcnv_state);
+
+	KUNIT_EXPECT_FALSE(test, blit_result);
+	KUNIT_EXPECT_MEMEQ(test, buf, result->expected, dst_size);
+}
+
 static void drm_test_fb_xrgb8888_to_argb8888(struct kunit *test)
 {
 	const struct convert_xrgb8888_case *params = test->param_value;
@@ -1851,6 +1931,7 @@ static struct kunit_case drm_format_helper_test_cases[] = {
 	KUNIT_CASE_PARAM(drm_test_fb_xrgb8888_to_argb1555, convert_xrgb8888_gen_params),
 	KUNIT_CASE_PARAM(drm_test_fb_xrgb8888_to_rgba5551, convert_xrgb8888_gen_params),
 	KUNIT_CASE_PARAM(drm_test_fb_xrgb8888_to_rgb888, convert_xrgb8888_gen_params),
+	KUNIT_CASE_PARAM(drm_test_fb_xrgb8888_to_bgr888, convert_xrgb8888_gen_params),
 	KUNIT_CASE_PARAM(drm_test_fb_xrgb8888_to_argb8888, convert_xrgb8888_gen_params),
 	KUNIT_CASE_PARAM(drm_test_fb_xrgb8888_to_xrgb2101010, convert_xrgb8888_gen_params),
 	KUNIT_CASE_PARAM(drm_test_fb_xrgb8888_to_argb2101010, convert_xrgb8888_gen_params),
diff --git a/drivers/gpu/drm/tiny/Kconfig b/drivers/gpu/drm/tiny/Kconfig
index f6889f649bc1..559a97bce12c 100644
--- a/drivers/gpu/drm/tiny/Kconfig
+++ b/drivers/gpu/drm/tiny/Kconfig
@@ -1,5 +1,17 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
+config DRM_APPLETBDRM
+	tristate "DRM support for Apple Touch Bars"
+	depends on DRM && USB && MMU
+	select DRM_KMS_HELPER
+	select DRM_GEM_SHMEM_HELPER
+	help
+	  Say Y here if you want support for the display of Touch Bars on x86
+	  MacBook Pros.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called appletbdrm.
+
 config DRM_ARCPGU
 	tristate "ARC PGU"
 	depends on DRM && OF
diff --git a/drivers/gpu/drm/tiny/Makefile b/drivers/gpu/drm/tiny/Makefile
index 76dde89a044b..9a1b412e764a 100644
--- a/drivers/gpu/drm/tiny/Makefile
+++ b/drivers/gpu/drm/tiny/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
+obj-$(CONFIG_DRM_APPLETBDRM)		+= appletbdrm.o
 obj-$(CONFIG_DRM_ARCPGU)		+= arcpgu.o
 obj-$(CONFIG_DRM_BOCHS)			+= bochs.o
 obj-$(CONFIG_DRM_CIRRUS_QEMU)		+= cirrus.o
diff --git a/drivers/gpu/drm/tiny/appletbdrm.c b/drivers/gpu/drm/tiny/appletbdrm.c
new file mode 100644
index 000000000000..b9440ce0064e
--- /dev/null
+++ b/drivers/gpu/drm/tiny/appletbdrm.c
@@ -0,0 +1,624 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Apple Touch Bar DRM Driver
+ *
+ * Copyright (c) 2023 Kerem Karabay <kekrby@gmail.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <asm/unaligned.h>
+
+#include <linux/usb.h>
+#include <linux/module.h>
+
+#include <drm/drm_drv.h>
+#include <drm/drm_fourcc.h>
+#include <drm/drm_probe_helper.h>
+#include <drm/drm_atomic_helper.h>
+#include <drm/drm_damage_helper.h>
+#include <drm/drm_format_helper.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/drm_gem_atomic_helper.h>
+#include <drm/drm_simple_kms_helper.h>
+#include <drm/drm_gem_framebuffer_helper.h>
+
+#define _APPLETBDRM_FOURCC(s)		(((s)[0] << 24) | ((s)[1] << 16) | ((s)[2] << 8) | (s)[3])
+#define APPLETBDRM_FOURCC(s)		_APPLETBDRM_FOURCC(#s)
+
+#define APPLETBDRM_PIXEL_FORMAT		APPLETBDRM_FOURCC(RGBA) /* The actual format is BGR888 */
+#define APPLETBDRM_BITS_PER_PIXEL	24
+
+#define APPLETBDRM_MSG_CLEAR_DISPLAY	APPLETBDRM_FOURCC(CLRD)
+#define APPLETBDRM_MSG_GET_INFORMATION	APPLETBDRM_FOURCC(GINF)
+#define APPLETBDRM_MSG_UPDATE_COMPLETE	APPLETBDRM_FOURCC(UDCL)
+#define APPLETBDRM_MSG_SIGNAL_READINESS	APPLETBDRM_FOURCC(REDY)
+
+#define APPLETBDRM_BULK_MSG_TIMEOUT	1000
+
+#define drm_to_adev(_drm)		container_of(_drm, struct appletbdrm_device, drm)
+#define adev_to_udev(adev)		interface_to_usbdev(to_usb_interface(adev->dev))
+
+struct appletbdrm_device {
+	struct device *dev;
+
+	u8 in_ep;
+	u8 out_ep;
+
+	u32 width;
+	u32 height;
+
+	struct drm_device drm;
+	struct drm_display_mode mode;
+	struct drm_connector connector;
+	struct drm_simple_display_pipe pipe;
+
+	bool readiness_signal_received;
+};
+
+struct appletbdrm_request_header {
+	__le16 unk_00;
+	__le16 unk_02;
+	__le32 unk_04;
+	__le32 unk_08;
+	__le32 size;
+} __packed;
+
+struct appletbdrm_response_header {
+	u8 unk_00[16];
+	u32 msg;
+} __packed;
+
+struct appletbdrm_simple_request {
+	struct appletbdrm_request_header header;
+	u32 msg;
+	u8 unk_14[8];
+	__le32 size;
+} __packed;
+
+struct appletbdrm_information {
+	struct appletbdrm_response_header header;
+	u8 unk_14[12];
+	__le32 width;
+	__le32 height;
+	u8 bits_per_pixel;
+	__le32 bytes_per_row;
+	__le32 orientation;
+	__le32 bitmap_info;
+	u32 pixel_format;
+	__le32 width_inches;	/* floating point */
+	__le32 height_inches;	/* floating point */
+} __packed;
+
+struct appletbdrm_frame {
+	__le16 begin_x;
+	__le16 begin_y;
+	__le16 width;
+	__le16 height;
+	__le32 buf_size;
+	u8 buf[];
+} __packed;
+
+struct appletbdrm_fb_request_footer {
+	u8 unk_00[12];
+	__le32 unk_0c;
+	u8 unk_10[12];
+	__le32 unk_1c;
+	__le64 timestamp;
+	u8 unk_28[12];
+	__le32 unk_34;
+	u8 unk_38[20];
+	__le32 unk_4c;
+} __packed;
+
+struct appletbdrm_fb_request {
+	struct appletbdrm_request_header header;
+	__le16 unk_10;
+	u8 msg_id;
+	u8 unk_13[29];
+	/*
+	 * Contents of `data`:
+	 * - struct appletbdrm_frame frames[];
+	 * - struct appletbdrm_fb_request_footer footer;
+	 * - padding to make the total size a multiple of 16
+	 */
+	u8 data[];
+} __packed;
+
+struct appletbdrm_fb_request_response {
+	struct appletbdrm_response_header header;
+	u8 unk_14[12];
+	__le64 timestamp;
+} __packed;
+
+static int appletbdrm_send_request(struct appletbdrm_device *adev,
+				   struct appletbdrm_request_header *request, size_t size)
+{
+	struct usb_device *udev = adev_to_udev(adev);
+	struct drm_device *drm = &adev->drm;
+	int ret, actual_size;
+
+	ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, adev->out_ep),
+			   request, size, &actual_size, APPLETBDRM_BULK_MSG_TIMEOUT);
+	if (ret) {
+		drm_err(drm, "Failed to send message (%pe)\n", ERR_PTR(ret));
+		return ret;
+	}
+
+	if (actual_size != size) {
+		drm_err(drm, "Actual size (%d) doesn't match expected size (%lu)\n",
+			actual_size, size);
+		return -EIO;
+	}
+
+	return ret;
+}
+
+static int appletbdrm_read_response(struct appletbdrm_device *adev,
+				    struct appletbdrm_response_header *response,
+				    size_t size, u32 expected_response)
+{
+	struct usb_device *udev = adev_to_udev(adev);
+	struct drm_device *drm = &adev->drm;
+	int ret, actual_size;
+
+retry:
+	ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, adev->in_ep),
+			   response, size, &actual_size, APPLETBDRM_BULK_MSG_TIMEOUT);
+	if (ret) {
+		drm_err(drm, "Failed to read response (%pe)\n", ERR_PTR(ret));
+		return ret;
+	}
+
+	/*
+	 * The device responds to the first request sent in a particular
+	 * timeframe after the USB device configuration is set with a readiness
+	 * signal, in which case the response should be read again
+	 */
+	if (response->msg == APPLETBDRM_MSG_SIGNAL_READINESS) {
+		if (!adev->readiness_signal_received) {
+			adev->readiness_signal_received = true;
+			goto retry;
+		}
+
+		drm_err(drm, "Encountered unexpected readiness signal\n");
+		return -EIO;
+	}
+
+	if (actual_size != size) {
+		drm_err(drm, "Actual size (%d) doesn't match expected size (%lu)\n",
+			actual_size, size);
+		return -EIO;
+	}
+
+	if (response->msg != expected_response) {
+		drm_err(drm, "Unexpected response from device (expected %p4ch found %p4ch)\n",
+			&expected_response, &response->msg);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int appletbdrm_send_msg(struct appletbdrm_device *adev, u32 msg)
+{
+	struct appletbdrm_simple_request *request;
+	int ret;
+
+	request = kzalloc(sizeof(*request), GFP_KERNEL);
+	if (!request)
+		return -ENOMEM;
+
+	request->header.unk_00 = cpu_to_le16(2);
+	request->header.unk_02 = cpu_to_le16(0x1512);
+	request->header.size = cpu_to_le32(sizeof(*request) - sizeof(request->header));
+	request->msg = msg;
+	request->size = request->header.size;
+
+	ret = appletbdrm_send_request(adev, &request->header, sizeof(*request));
+
+	kfree(request);
+
+	return ret;
+}
+
+static int appletbdrm_clear_display(struct appletbdrm_device *adev)
+{
+	return appletbdrm_send_msg(adev, APPLETBDRM_MSG_CLEAR_DISPLAY);
+}
+
+static int appletbdrm_signal_readiness(struct appletbdrm_device *adev)
+{
+	return appletbdrm_send_msg(adev, APPLETBDRM_MSG_SIGNAL_READINESS);
+}
+
+static int appletbdrm_get_information(struct appletbdrm_device *adev)
+{
+	struct appletbdrm_information *info;
+	struct drm_device *drm = &adev->drm;
+	u8 bits_per_pixel;
+	u32 pixel_format;
+	int ret;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	ret = appletbdrm_send_msg(adev, APPLETBDRM_MSG_GET_INFORMATION);
+	if (ret)
+		return ret;
+
+	ret = appletbdrm_read_response(adev, &info->header, sizeof(*info),
+				       APPLETBDRM_MSG_GET_INFORMATION);
+	if (ret)
+		goto free_info;
+
+	bits_per_pixel = info->bits_per_pixel;
+	pixel_format = get_unaligned(&info->pixel_format);
+
+	adev->width = get_unaligned_le32(&info->width);
+	adev->height = get_unaligned_le32(&info->height);
+
+	if (bits_per_pixel != APPLETBDRM_BITS_PER_PIXEL) {
+		drm_err(drm, "Encountered unexpected bits per pixel value (%d)\n", bits_per_pixel);
+		ret = -EINVAL;
+		goto free_info;
+	}
+
+	if (pixel_format != APPLETBDRM_PIXEL_FORMAT) {
+		drm_err(drm, "Encountered unknown pixel format (%p4ch)\n", &pixel_format);
+		ret = -EINVAL;
+		goto free_info;
+	}
+
+free_info:
+	kfree(info);
+
+	return ret;
+}
+
+static u32 rect_size(struct drm_rect *rect)
+{
+	return drm_rect_width(rect) * drm_rect_height(rect) * (APPLETBDRM_BITS_PER_PIXEL / 8);
+}
+
+static int appletbdrm_flush_damage(struct appletbdrm_device *adev,
+				   struct drm_plane_state *old_state,
+				   struct drm_plane_state *state)
+{
+	struct drm_shadow_plane_state *shadow_plane_state = to_drm_shadow_plane_state(state);
+	struct appletbdrm_fb_request_response *response;
+	struct appletbdrm_fb_request_footer *footer;
+	struct drm_atomic_helper_damage_iter iter;
+	struct drm_framebuffer *fb = state->fb;
+	struct appletbdrm_fb_request *request;
+	struct drm_device *drm = &adev->drm;
+	struct appletbdrm_frame *frame;
+	u64 timestamp = ktime_get_ns();
+	struct drm_rect damage;
+	size_t frames_size = 0;
+	size_t request_size;
+	int ret;
+
+	drm_atomic_helper_damage_iter_init(&iter, old_state, state);
+	drm_atomic_for_each_plane_damage(&iter, &damage) {
+		frames_size += struct_size(frame, buf, rect_size(&damage));
+	}
+
+	if (!frames_size)
+		return 0;
+
+	request_size = ALIGN(sizeof(*request) + frames_size + sizeof(*footer), 16);
+
+	request = kzalloc(request_size, GFP_KERNEL);
+	if (!request)
+		return -ENOMEM;
+
+	response = kzalloc(sizeof(*response), GFP_KERNEL);
+	if (!response) {
+		ret = -ENOMEM;
+		goto free_request;
+	}
+
+	ret = drm_gem_fb_begin_cpu_access(fb, DMA_FROM_DEVICE);
+	if (ret) {
+		drm_err(drm, "Failed to start CPU framebuffer access (%pe)\n", ERR_PTR(ret));
+		goto free_response;
+	}
+
+	request->header.unk_00 = cpu_to_le16(2);
+	request->header.unk_02 = cpu_to_le16(0x12);
+	request->header.unk_04 = cpu_to_le32(9);
+	request->header.size = cpu_to_le32(request_size - sizeof(request->header));
+	request->unk_10 = cpu_to_le16(1);
+	request->msg_id = timestamp & 0xff;
+
+	frame = (struct appletbdrm_frame *)request->data;
+
+	drm_atomic_helper_damage_iter_init(&iter, old_state, state);
+	drm_atomic_for_each_plane_damage(&iter, &damage) {
+		struct iosys_map dst = IOSYS_MAP_INIT_VADDR(frame->buf);
+		u32 buf_size = rect_size(&damage);
+
+		/*
+		 * The coordinates need to be translated to the coordinate
+		 * system the device expects, see the comment in
+		 * appletbdrm_setup_mode_config
+		 */
+		frame->begin_x = cpu_to_le16(damage.y1);
+		frame->begin_y = cpu_to_le16(adev->height - damage.x2);
+		frame->width = cpu_to_le16(drm_rect_height(&damage));
+		frame->height = cpu_to_le16(drm_rect_width(&damage));
+		frame->buf_size = cpu_to_le32(buf_size);
+
+		ret = drm_fb_blit(&dst, NULL, DRM_FORMAT_BGR888,
+				  &shadow_plane_state->data[0], fb, &damage, &shadow_plane_state->fmtcnv_state);
+		if (ret) {
+			drm_err(drm, "Failed to copy damage clip (%pe)\n", ERR_PTR(ret));
+			goto end_fb_cpu_access;
+		}
+
+		frame = (void *)frame + struct_size(frame, buf, buf_size);
+	}
+
+	footer = (struct appletbdrm_fb_request_footer *)&request->data[frames_size];
+
+	footer->unk_0c = cpu_to_le32(0xfffe);
+	footer->unk_1c = cpu_to_le32(0x80001);
+	footer->unk_34 = cpu_to_le32(0x80002);
+	footer->unk_4c = cpu_to_le32(0xffff);
+	footer->timestamp = cpu_to_le64(timestamp);
+
+	ret = appletbdrm_send_request(adev, &request->header, request_size);
+	if (ret)
+		goto end_fb_cpu_access;
+
+	ret = appletbdrm_read_response(adev, &response->header, sizeof(*response),
+				       APPLETBDRM_MSG_UPDATE_COMPLETE);
+	if (ret)
+		goto end_fb_cpu_access;
+
+	if (response->timestamp != footer->timestamp) {
+		drm_err(drm, "Response timestamp (%llu) doesn't match request timestamp (%llu)\n",
+			le64_to_cpu(response->timestamp), timestamp);
+		goto end_fb_cpu_access;
+	}
+
+end_fb_cpu_access:
+	drm_gem_fb_end_cpu_access(fb, DMA_FROM_DEVICE);
+free_response:
+	kfree(response);
+free_request:
+	kfree(request);
+
+	return ret;
+}
+
+static int appletbdrm_connector_helper_get_modes(struct drm_connector *connector)
+{
+	struct appletbdrm_device *adev = drm_to_adev(connector->dev);
+
+	return drm_connector_helper_get_modes_fixed(connector, &adev->mode);
+}
+
+static enum drm_mode_status appletbdrm_pipe_mode_valid(struct drm_simple_display_pipe *pipe,
+						       const struct drm_display_mode *mode)
+{
+	struct drm_crtc *crtc = &pipe->crtc;
+	struct appletbdrm_device *adev = drm_to_adev(crtc->dev);
+
+	return drm_crtc_helper_mode_valid_fixed(crtc, mode, &adev->mode);
+}
+
+static void appletbdrm_pipe_disable(struct drm_simple_display_pipe *pipe)
+{
+	struct appletbdrm_device *adev = drm_to_adev(pipe->crtc.dev);
+	int idx;
+
+	if (!drm_dev_enter(&adev->drm, &idx))
+		return;
+
+	appletbdrm_clear_display(adev);
+
+	drm_dev_exit(idx);
+}
+
+static void appletbdrm_pipe_update(struct drm_simple_display_pipe *pipe,
+				   struct drm_plane_state *old_state)
+{
+	struct drm_crtc *crtc = &pipe->crtc;
+	struct appletbdrm_device *adev = drm_to_adev(crtc->dev);
+	int idx;
+
+	if (!crtc->state->active || !drm_dev_enter(&adev->drm, &idx))
+		return;
+
+	appletbdrm_flush_damage(adev, old_state, pipe->plane.state);
+
+	drm_dev_exit(idx);
+}
+
+static const u32 appletbdrm_formats[] = {
+	DRM_FORMAT_BGR888,
+	DRM_FORMAT_XRGB8888, /* emulated */
+};
+
+static const struct drm_mode_config_funcs appletbdrm_mode_config_funcs = {
+	.fb_create = drm_gem_fb_create_with_dirty,
+	.atomic_check = drm_atomic_helper_check,
+	.atomic_commit = drm_atomic_helper_commit,
+};
+
+static const struct drm_connector_funcs appletbdrm_connector_funcs = {
+	.reset = drm_atomic_helper_connector_reset,
+	.destroy = drm_connector_cleanup,
+	.fill_modes = drm_helper_probe_single_connector_modes,
+	.atomic_destroy_state = drm_atomic_helper_connector_destroy_state,
+	.atomic_duplicate_state = drm_atomic_helper_connector_duplicate_state,
+};
+
+static const struct drm_connector_helper_funcs appletbdrm_connector_helper_funcs = {
+	.get_modes = appletbdrm_connector_helper_get_modes,
+};
+
+static const struct drm_simple_display_pipe_funcs appletbdrm_pipe_funcs = {
+	DRM_GEM_SIMPLE_DISPLAY_PIPE_SHADOW_PLANE_FUNCS,
+	.update = appletbdrm_pipe_update,
+	.disable = appletbdrm_pipe_disable,
+	.mode_valid = appletbdrm_pipe_mode_valid,
+};
+
+DEFINE_DRM_GEM_FOPS(appletbdrm_drm_fops);
+
+static const struct drm_driver appletbdrm_drm_driver = {
+	DRM_GEM_SHMEM_DRIVER_OPS,
+	.name			= "appletbdrm",
+	.desc			= "Apple Touch Bar DRM Driver",
+	.date			= "20230910",
+	.major			= 1,
+	.minor			= 0,
+	.driver_features	= DRIVER_MODESET | DRIVER_GEM | DRIVER_ATOMIC,
+	.fops			= &appletbdrm_drm_fops,
+};
+
+static int appletbdrm_setup_mode_config(struct appletbdrm_device *adev)
+{
+	struct drm_connector *connector = &adev->connector;
+	struct drm_device *drm = &adev->drm;
+	struct device *dev = adev->dev;
+	int ret;
+
+	ret = drmm_mode_config_init(drm);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to initialize mode configuration\n");
+
+	/*
+	 * The coordinate system used by the device is different from the
+	 * coordinate system of the framebuffer in that the x and y axes are
+	 * swapped, and that the y axis is inverted; so what the device reports
+	 * as the height is actually the width of the framebuffer and vice
+	 * versa
+	 */
+	drm->mode_config.min_width = 0;
+	drm->mode_config.min_height = 0;
+	drm->mode_config.max_width = max(adev->height, DRM_SHADOW_PLANE_MAX_WIDTH);
+	drm->mode_config.max_height = max(adev->width, DRM_SHADOW_PLANE_MAX_HEIGHT);
+	drm->mode_config.preferred_depth = APPLETBDRM_BITS_PER_PIXEL;
+	drm->mode_config.funcs = &appletbdrm_mode_config_funcs;
+
+	adev->mode = (struct drm_display_mode) {
+		DRM_MODE_INIT(60, adev->height, adev->width,
+			      DRM_MODE_RES_MM(adev->height, 218),
+			      DRM_MODE_RES_MM(adev->width, 218))
+	};
+
+	ret = drm_connector_init(drm, connector,
+				 &appletbdrm_connector_funcs, DRM_MODE_CONNECTOR_USB);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to initialize connector\n");
+
+	drm_connector_helper_add(connector, &appletbdrm_connector_helper_funcs);
+
+	ret = drm_connector_set_panel_orientation(connector,
+						  DRM_MODE_PANEL_ORIENTATION_RIGHT_UP);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to set panel orientation\n");
+
+	connector->display_info.non_desktop = true;
+	ret = drm_object_property_set_value(&connector->base,
+					    drm->mode_config.non_desktop_property, true);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to set non-desktop property\n");
+
+	ret = drm_simple_display_pipe_init(drm, &adev->pipe, &appletbdrm_pipe_funcs,
+					   appletbdrm_formats, ARRAY_SIZE(appletbdrm_formats),
+					   NULL, &adev->connector);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to initialize simple display pipe\n");
+
+	drm_plane_enable_fb_damage_clips(&adev->pipe.plane);
+
+	drm_mode_config_reset(drm);
+
+	ret = drm_dev_register(drm, 0);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to register DRM device\n");
+
+	return 0;
+}
+
+static int appletbdrm_probe(struct usb_interface *intf,
+			    const struct usb_device_id *id)
+{
+	struct usb_endpoint_descriptor *bulk_in, *bulk_out;
+	struct device *dev = &intf->dev;
+	struct appletbdrm_device *adev;
+	int ret;
+
+	ret = usb_find_common_endpoints(intf->cur_altsetting, &bulk_in, &bulk_out, NULL, NULL);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to find bulk endpoints\n");
+
+	adev = devm_drm_dev_alloc(dev, &appletbdrm_drm_driver, struct appletbdrm_device, drm);
+	if (IS_ERR(adev))
+		return PTR_ERR(adev);
+
+	adev->dev = dev;
+	adev->in_ep = bulk_in->bEndpointAddress;
+	adev->out_ep = bulk_out->bEndpointAddress;
+
+	usb_set_intfdata(intf, adev);
+
+	ret = appletbdrm_get_information(adev);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to get display information\n");
+
+	ret = appletbdrm_signal_readiness(adev);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to signal readiness\n");
+
+	ret = appletbdrm_clear_display(adev);
+	if (ret)
+		return dev_err_probe(dev, ret, "Failed to clear display\n");
+
+	return appletbdrm_setup_mode_config(adev);
+}
+
+static void appletbdrm_disconnect(struct usb_interface *intf)
+{
+	struct appletbdrm_device *adev = usb_get_intfdata(intf);
+	struct drm_device *drm = &adev->drm;
+
+	drm_dev_unplug(drm);
+	drm_atomic_helper_shutdown(drm);
+}
+
+static void appletbdrm_shutdown(struct usb_interface *intf)
+{
+	struct appletbdrm_device *adev = usb_get_intfdata(intf);
+
+	/*
+	 * The framebuffer needs to be cleared on shutdown since its content
+	 * persists across boots
+	 */
+	drm_atomic_helper_shutdown(&adev->drm);
+}
+
+static const struct usb_device_id appletbdrm_usb_id_table[] = {
+	{ USB_DEVICE_INTERFACE_CLASS(0x05ac, 0x8302, USB_CLASS_AUDIO_VIDEO) },
+	{}
+};
+MODULE_DEVICE_TABLE(usb, appletbdrm_usb_id_table);
+
+static struct usb_driver appletbdrm_usb_driver = {
+	.name		= "appletbdrm",
+	.probe		= appletbdrm_probe,
+	.disconnect	= appletbdrm_disconnect,
+	.shutdown	= appletbdrm_shutdown,
+	.id_table	= appletbdrm_usb_id_table,
+};
+module_usb_driver(appletbdrm_usb_driver);
+
+MODULE_AUTHOR("Kerem Karabay <kekrby@gmail.com>");
+MODULE_DESCRIPTION("Apple Touch Bar DRM Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/gpu/vga/vga_switcheroo.c b/drivers/gpu/vga/vga_switcheroo.c
index 365e6ddbe90f..cf357cd3389d 100644
--- a/drivers/gpu/vga/vga_switcheroo.c
+++ b/drivers/gpu/vga/vga_switcheroo.c
@@ -438,12 +438,7 @@ find_active_client(struct list_head *head)
 bool vga_switcheroo_client_probe_defer(struct pci_dev *pdev)
 {
 	if ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) {
-		/*
-		 * apple-gmux is needed on pre-retina MacBook Pro
-		 * to probe the panel if pdev is the inactive GPU.
-		 */
-		if (apple_gmux_present() && pdev != vga_default_device() &&
-		    !vgasr_priv.handler_flags)
+		if (apple_gmux_present() && !vgasr_priv.handler_flags)
 			return true;
 	}
 
diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig
index 08446c89eff6..35ef5d4ef068 100644
--- a/drivers/hid/Kconfig
+++ b/drivers/hid/Kconfig
@@ -148,6 +148,27 @@ config HID_APPLEIR
 
 	Say Y here if you want support for Apple infrared remote control.
 
+config HID_APPLETB_BL
+	tristate "Apple Touch Bar Backlight"
+	depends on BACKLIGHT_CLASS_DEVICE
+	help
+	  Say Y here if you want support for the backlight of Touch Bars on x86
+	  MacBook Pros.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called hid-appletb-bl.
+
+config HID_APPLETB_KBD
+	tristate "Apple Touch Bar Keyboard Mode"
+	depends on USB_HID
+	help
+	  Say Y here if you want support for the keyboard mode (escape,
+	  function, media and brightness keys) of Touch Bars on x86 MacBook
+	  Pros.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called hid-appletb-kbd.
+
 config HID_ASUS
 	tristate "Asus"
 	depends on USB_HID
@@ -723,6 +744,7 @@ config HID_MULTITOUCH
 	  Say Y here if you have one of the following devices:
 	  - 3M PCT touch screens
 	  - ActionStar dual touch panels
+	  - Touch Bars on x86 MacBook Pros
 	  - Atmel panels
 	  - Cando dual touch panels
 	  - Chunghwa panels
diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile
index ce71b53ea6c5..fecec1d61393 100644
--- a/drivers/hid/Makefile
+++ b/drivers/hid/Makefile
@@ -29,6 +29,8 @@ obj-$(CONFIG_HID_ALPS)		+= hid-alps.o
 obj-$(CONFIG_HID_ACRUX)		+= hid-axff.o
 obj-$(CONFIG_HID_APPLE)		+= hid-apple.o
 obj-$(CONFIG_HID_APPLEIR)	+= hid-appleir.o
+obj-$(CONFIG_HID_APPLETB_BL)	+= hid-appletb-bl.o
+obj-$(CONFIG_HID_APPLETB_KBD)	+= hid-appletb-kbd.o
 obj-$(CONFIG_HID_CREATIVE_SB0540)	+= hid-creative-sb0540.o
 obj-$(CONFIG_HID_ASUS)		+= hid-asus.o
 obj-$(CONFIG_HID_AUREAL)	+= hid-aureal.o
diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c
index bd022e004356..6dedb84d7cc3 100644
--- a/drivers/hid/hid-apple.c
+++ b/drivers/hid/hid-apple.c
@@ -8,6 +8,8 @@
  *  Copyright (c) 2006-2007 Jiri Kosina
  *  Copyright (c) 2008 Jiri Slaby <jirislaby@gmail.com>
  *  Copyright (c) 2019 Paul Pawlowski <paul@mrarm.io>
+ *  Copyright (c) 2023 Orlando Chamberlain <orlandoch.dev@gmail.com>
+ *  Copyright (c) 2024 Aditya Garg <gargaditya08@live.com>
  */
 
 /*
@@ -23,6 +25,7 @@
 #include <linux/timer.h>
 #include <linux/string.h>
 #include <linux/leds.h>
+#include <dt-bindings/leds/common.h>
 
 #include "hid-ids.h"
 
@@ -38,12 +41,17 @@
 #define APPLE_RDESC_BATTERY	BIT(9)
 #define APPLE_BACKLIGHT_CTL	BIT(10)
 #define APPLE_IS_NON_APPLE	BIT(11)
+#define APPLE_MAGIC_BACKLIGHT	BIT(12)
 
 #define APPLE_FLAG_FKEY		0x01
 
 #define HID_COUNTRY_INTERNATIONAL_ISO	13
 #define APPLE_BATTERY_TIMEOUT_MS	60000
 
+#define HID_USAGE_MAGIC_BL			0xff00000f
+#define APPLE_MAGIC_REPORT_ID_POWER		3
+#define APPLE_MAGIC_REPORT_ID_BRIGHTNESS	1
+
 static unsigned int fnmode = 3;
 module_param(fnmode, uint, 0644);
 MODULE_PARM_DESC(fnmode, "Mode of fn key on Apple keyboards (0 = disabled, "
@@ -81,6 +89,12 @@ struct apple_sc_backlight {
 	struct hid_device *hdev;
 };
 
+struct apple_magic_backlight {
+	struct led_classdev cdev;
+	struct hid_report *brightness;
+	struct hid_report *power;
+};
+
 struct apple_sc {
 	struct hid_device *hdev;
 	unsigned long quirks;
@@ -822,6 +836,66 @@ static int apple_backlight_init(struct hid_device *hdev)
 	return ret;
 }
 
+static void apple_magic_backlight_report_set(struct hid_report *rep, s32 value, u8 rate)
+{
+	rep->field[0]->value[0] = value;
+	rep->field[1]->value[0] = 0x5e; /* Mimic Windows */
+	rep->field[1]->value[0] |= rate << 8;
+
+	hid_hw_request(rep->device, rep, HID_REQ_SET_REPORT);
+}
+
+static void apple_magic_backlight_set(struct apple_magic_backlight *backlight,
+				     int brightness, char rate)
+{
+	apple_magic_backlight_report_set(backlight->power, brightness ? 1 : 0, rate);
+	if (brightness)
+		apple_magic_backlight_report_set(backlight->brightness, brightness, rate);
+}
+
+static int apple_magic_backlight_led_set(struct led_classdev *led_cdev,
+					 enum led_brightness brightness)
+{
+	struct apple_magic_backlight *backlight = container_of(led_cdev,
+			struct apple_magic_backlight, cdev);
+
+	apple_magic_backlight_set(backlight, brightness, 1);
+	return 0;
+}
+
+static int apple_magic_backlight_init(struct hid_device *hdev)
+{
+	struct apple_magic_backlight *backlight;
+	struct hid_report_enum *report_enum;
+
+	/*
+	 * Ensure this usb endpoint is for the keyboard backlight, not touchbar
+	 * backlight.
+	 */
+	if (hdev->collection[0].usage != HID_USAGE_MAGIC_BL)
+		return -ENODEV;
+
+	backlight = devm_kzalloc(&hdev->dev, sizeof(*backlight), GFP_KERNEL);
+	if (!backlight)
+		return -ENOMEM;
+
+	report_enum = &hdev->report_enum[HID_FEATURE_REPORT];
+	backlight->brightness = report_enum->report_id_hash[APPLE_MAGIC_REPORT_ID_BRIGHTNESS];
+	backlight->power = report_enum->report_id_hash[APPLE_MAGIC_REPORT_ID_POWER];
+
+	if (!backlight->brightness || !backlight->power)
+		return -ENODEV;
+
+	backlight->cdev.name = ":white:" LED_FUNCTION_KBD_BACKLIGHT;
+	backlight->cdev.max_brightness = backlight->brightness->field[0]->logical_maximum;
+	backlight->cdev.brightness_set_blocking = apple_magic_backlight_led_set;
+
+	apple_magic_backlight_set(backlight, 0, 0);
+
+	return devm_led_classdev_register(&hdev->dev, &backlight->cdev);
+
+}
+
 static int apple_probe(struct hid_device *hdev,
 		const struct hid_device_id *id)
 {
@@ -860,7 +934,18 @@ static int apple_probe(struct hid_device *hdev,
 	if (quirks & APPLE_BACKLIGHT_CTL)
 		apple_backlight_init(hdev);
 
+	if (quirks & APPLE_MAGIC_BACKLIGHT) {
+		ret = apple_magic_backlight_init(hdev);
+		if (ret)
+			goto out_err;
+	}
+
 	return 0;
+
+out_err:
+	del_timer_sync(&asc->battery_timer);
+	hid_hw_stop(hdev);
+	return ret;
 }
 
 static void apple_remove(struct hid_device *hdev)
@@ -1073,6 +1158,8 @@ static const struct hid_device_id apple_devices[] = {
 		.driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK | APPLE_RDESC_BATTERY },
 	{ HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_NUMPAD_2021),
 		.driver_data = APPLE_HAS_FN | APPLE_ISO_TILDE_QUIRK },
+	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_TOUCHBAR_BACKLIGHT),
+		.driver_data = APPLE_MAGIC_BACKLIGHT },
 
 	{ }
 };
diff --git a/drivers/hid/hid-appletb-bl.c b/drivers/hid/hid-appletb-bl.c
new file mode 100644
index 000000000000..00bbe45df4fa
--- /dev/null
+++ b/drivers/hid/hid-appletb-bl.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Apple Touch Bar Backlight Driver
+ *
+ * Copyright (c) 2017-2018 Ronald Tschalär
+ * Copyright (c) 2022-2023 Kerem Karabay <kekrby@gmail.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/hid.h>
+#include <linux/backlight.h>
+
+#include "hid-ids.h"
+
+#define APPLETB_BL_ON			1
+#define APPLETB_BL_DIM			3
+#define APPLETB_BL_OFF			4
+
+#define HID_UP_APPLEVENDOR_TB_BL	0xff120000
+
+#define HID_VD_APPLE_TB_BRIGHTNESS	0xff120001
+#define HID_USAGE_AUX1			0xff120020
+#define HID_USAGE_BRIGHTNESS		0xff120021
+
+static int appletb_bl_def_brightness = 2;
+module_param_named(brightness, appletb_bl_def_brightness, int, 0444);
+MODULE_PARM_DESC(brightness, "Default brightness:\n"
+			 "    0 - Touchbar is off\n"
+			 "    1 - Dim brightness\n"
+			 "    [2] - Full brightness");
+
+struct appletb_bl {
+	struct hid_field *aux1_field, *brightness_field;
+	struct backlight_device *bdev;
+
+	bool full_on;
+};
+
+const u8 appletb_bl_brightness_map[] = {
+	APPLETB_BL_OFF,
+	APPLETB_BL_DIM,
+	APPLETB_BL_ON
+};
+
+static int appletb_bl_set_brightness(struct appletb_bl *bl, u8 brightness)
+{
+	struct hid_report *report = bl->brightness_field->report;
+	struct hid_device *hdev = report->device;
+	int ret;
+
+	ret = hid_set_field(bl->aux1_field, 0, 1);
+	if (ret) {
+		hid_err(hdev, "Failed to set auxiliary field (%pe)\n", ERR_PTR(ret));
+		return ret;
+	}
+
+	ret = hid_set_field(bl->brightness_field, 0, brightness);
+	if (ret) {
+		hid_err(hdev, "Failed to set brightness field (%pe)\n", ERR_PTR(ret));
+		return ret;
+	}
+
+	if (!bl->full_on) {
+		ret = hid_hw_power(hdev, PM_HINT_FULLON);
+		if (ret < 0) {
+			hid_err(hdev, "Device didn't power on (%pe)\n", ERR_PTR(ret));
+			return ret;
+		}
+
+		bl->full_on = true;
+	}
+
+	hid_hw_request(hdev, report, HID_REQ_SET_REPORT);
+
+	if (brightness == APPLETB_BL_OFF) {
+		hid_hw_power(hdev, PM_HINT_NORMAL);
+		bl->full_on = false;
+	}
+
+	return 0;
+}
+
+static int appletb_bl_update_status(struct backlight_device *bdev)
+{
+	struct appletb_bl *bl = bl_get_data(bdev);
+	u16 brightness;
+
+	if (bdev->props.state & BL_CORE_SUSPENDED)
+		brightness = 0;
+	else
+		brightness = backlight_get_brightness(bdev);
+
+	return appletb_bl_set_brightness(bl, appletb_bl_brightness_map[brightness]);
+}
+
+static const struct backlight_ops appletb_bl_backlight_ops = {
+	.options = BL_CORE_SUSPENDRESUME,
+	.update_status = appletb_bl_update_status,
+};
+
+static int appletb_bl_probe(struct hid_device *hdev, const struct hid_device_id *id)
+{
+	struct hid_field *aux1_field, *brightness_field;
+	struct backlight_properties bl_props = { 0 };
+	struct device *dev = &hdev->dev;
+	struct appletb_bl *bl;
+	int ret;
+
+	ret = hid_parse(hdev);
+	if (ret)
+		return dev_err_probe(dev, ret, "HID parse failed\n");
+
+	aux1_field = hid_find_field(hdev, HID_FEATURE_REPORT,
+				    HID_VD_APPLE_TB_BRIGHTNESS, HID_USAGE_AUX1);
+
+	brightness_field = hid_find_field(hdev, HID_FEATURE_REPORT,
+					  HID_VD_APPLE_TB_BRIGHTNESS, HID_USAGE_BRIGHTNESS);
+
+	if (!aux1_field || !brightness_field)
+		return -ENODEV;
+
+	if (aux1_field->report != brightness_field->report)
+		return dev_err_probe(dev, -ENODEV, "Encountered unexpected report structure\n");
+
+	bl = devm_kzalloc(dev, sizeof(*bl), GFP_KERNEL);
+	if (!bl)
+		return -ENOMEM;
+
+	ret = hid_hw_start(hdev, HID_CONNECT_DRIVER);
+	if (ret)
+		return dev_err_probe(dev, ret, "HID hardware start failed\n");
+
+	ret = hid_hw_open(hdev);
+	if (ret) {
+		dev_err_probe(dev, ret, "HID hardware open failed\n");
+		goto stop_hw;
+	}
+
+	bl->aux1_field = aux1_field;
+	bl->brightness_field = brightness_field;
+
+	if (appletb_bl_def_brightness == 0)
+		ret = appletb_bl_set_brightness(bl, APPLETB_BL_OFF);
+	else if (appletb_bl_def_brightness == 1)
+		ret = appletb_bl_set_brightness(bl, APPLETB_BL_DIM);
+	else
+		ret = appletb_bl_set_brightness(bl, APPLETB_BL_ON);
+
+	if (ret) {
+		dev_err_probe(dev, ret, "Failed to set touch bar brightness to off\n");
+		goto close_hw;
+	}
+
+	bl_props.type = BACKLIGHT_RAW;
+	bl_props.max_brightness = ARRAY_SIZE(appletb_bl_brightness_map) - 1;
+
+	bl->bdev = devm_backlight_device_register(dev, "appletb_backlight", dev, bl,
+						  &appletb_bl_backlight_ops, &bl_props);
+	if (IS_ERR(bl->bdev)) {
+		ret = PTR_ERR(bl->bdev);
+		dev_err_probe(dev, ret, "Failed to register backlight device\n");
+		goto close_hw;
+	}
+
+	hid_set_drvdata(hdev, bl);
+
+	return 0;
+
+close_hw:
+	hid_hw_close(hdev);
+stop_hw:
+	hid_hw_stop(hdev);
+
+	return ret;
+}
+
+static void appletb_bl_remove(struct hid_device *hdev)
+{
+	struct appletb_bl *bl = hid_get_drvdata(hdev);
+
+	appletb_bl_set_brightness(bl, APPLETB_BL_OFF);
+
+	hid_hw_close(hdev);
+	hid_hw_stop(hdev);
+}
+
+static const struct hid_device_id appletb_bl_hid_ids[] = {
+	/* MacBook Pro's 2018, 2019, with T2 chip: iBridge DFR Brightness */
+	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_TOUCHBAR_BACKLIGHT) },
+	{ }
+};
+MODULE_DEVICE_TABLE(hid, appletb_bl_hid_ids);
+
+static struct hid_driver appletb_bl_hid_driver = {
+	.name = "hid-appletb-bl",
+	.id_table = appletb_bl_hid_ids,
+	.probe = appletb_bl_probe,
+	.remove = appletb_bl_remove,
+};
+module_hid_driver(appletb_bl_hid_driver);
+
+MODULE_AUTHOR("Ronald Tschalär");
+MODULE_AUTHOR("Kerem Karabay <kekrby@gmail.com>");
+MODULE_DESCRIPTION("MacBookPro Touch Bar Backlight Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/hid/hid-appletb-kbd.c b/drivers/hid/hid-appletb-kbd.c
new file mode 100644
index 000000000000..ec8051dcf4db
--- /dev/null
+++ b/drivers/hid/hid-appletb-kbd.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Apple Touch Bar Keyboard Mode Driver
+ *
+ * Copyright (c) 2017-2018 Ronald Tschalär
+ * Copyright (c) 2022-2023 Kerem Karabay <kekrby@gmail.com>
+ * Copyright (c) 2024 Aditya Garg <gargaditya08@live.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/hid.h>
+#include <linux/usb.h>
+#include <linux/input.h>
+#include <linux/sysfs.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/input/sparse-keymap.h>
+
+#include "hid-ids.h"
+
+#define APPLETB_KBD_MODE_ESC	0
+#define APPLETB_KBD_MODE_FN	1
+#define APPLETB_KBD_MODE_SPCL	2
+#define APPLETB_KBD_MODE_OFF	3
+#define APPLETB_KBD_MODE_MAX	APPLETB_KBD_MODE_OFF
+
+#define APPLETB_DEVID_KEYBOARD	1
+
+#define HID_USAGE_MODE		0x00ff0004
+
+static int appletb_tb_def_mode = APPLETB_KBD_MODE_FN;
+module_param_named(mode, appletb_tb_def_mode, int, 0444);
+MODULE_PARM_DESC(mode, "Default touchbar mode:\n"
+			 "    0 - escape key only\n"
+			 "    [1] - function-keys only\n"
+			 "    2 - special keys only");
+
+static bool appletb_tb_fn_toggle = true;
+module_param_named(fntoggle, appletb_tb_fn_toggle, bool, 0644);
+MODULE_PARM_DESC(fntoggle, "Switch between Fn and media controls on pressing Fn key");
+
+struct appletb_kbd {
+	struct hid_field *mode_field;
+
+	u8 saved_mode;
+	u8 current_mode;
+	struct input_handler inp_handler;
+	struct input_handle kbd_handle;
+
+};
+
+static const struct key_entry appletb_kbd_keymap[] = {
+	{ KE_KEY, KEY_ESC, { KEY_ESC } },
+	{ KE_KEY, KEY_F1,  { KEY_BRIGHTNESSDOWN } },
+	{ KE_KEY, KEY_F2,  { KEY_BRIGHTNESSUP } },
+	{ KE_KEY, KEY_F3,  { KEY_RESERVED } },
+	{ KE_KEY, KEY_F4,  { KEY_RESERVED } },
+	{ KE_KEY, KEY_F5,  { KEY_KBDILLUMDOWN } },
+	{ KE_KEY, KEY_F6,  { KEY_KBDILLUMUP } },
+	{ KE_KEY, KEY_F7,  { KEY_PREVIOUSSONG } },
+	{ KE_KEY, KEY_F8,  { KEY_PLAYPAUSE } },
+	{ KE_KEY, KEY_F9,  { KEY_NEXTSONG } },
+	{ KE_KEY, KEY_F10, { KEY_MUTE } },
+	{ KE_KEY, KEY_F11, { KEY_VOLUMEDOWN } },
+	{ KE_KEY, KEY_F12, { KEY_VOLUMEUP } },
+	{ KE_END, 0 }
+};
+
+static int appletb_kbd_set_mode(struct appletb_kbd *kbd, u8 mode)
+{
+	struct hid_report *report = kbd->mode_field->report;
+	struct hid_device *hdev = report->device;
+	int ret;
+
+	ret = hid_hw_power(hdev, PM_HINT_FULLON);
+	if (ret) {
+		hid_err(hdev, "Device didn't resume (%pe)\n", ERR_PTR(ret));
+		return ret;
+	}
+
+	ret = hid_set_field(kbd->mode_field, 0, mode);
+	if (ret) {
+		hid_err(hdev, "Failed to set mode field to %u (%pe)\n", mode, ERR_PTR(ret));
+		goto power_normal;
+	}
+
+	hid_hw_request(hdev, report, HID_REQ_SET_REPORT);
+
+	kbd->current_mode = mode;
+
+power_normal:
+	hid_hw_power(hdev, PM_HINT_NORMAL);
+
+	return ret;
+}
+
+static ssize_t mode_show(struct device *dev,
+			 struct device_attribute *attr, char *buf)
+{
+	struct appletb_kbd *kbd = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%d\n", kbd->current_mode);
+}
+
+static ssize_t mode_store(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf, size_t size)
+{
+	struct appletb_kbd *kbd = dev_get_drvdata(dev);
+	u8 mode;
+	int ret;
+
+	ret = kstrtou8(buf, 0, &mode);
+	if (ret)
+		return ret;
+
+	if (mode > APPLETB_KBD_MODE_MAX)
+		return -EINVAL;
+
+	ret = appletb_kbd_set_mode(kbd, mode);
+
+	return ret < 0 ? ret : size;
+}
+static DEVICE_ATTR_RW(mode);
+
+struct attribute *appletb_kbd_attrs[] = {
+	&dev_attr_mode.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(appletb_kbd);
+
+static int appletb_tb_key_to_slot(unsigned int code)
+{
+	switch (code) {
+	case KEY_ESC:
+		return 0;
+	case KEY_F1 ... KEY_F10:
+		return code - KEY_F1 + 1;
+	case KEY_F11 ... KEY_F12:
+		return code - KEY_F11 + 11;
+
+	default:
+		return -EINVAL;
+	}
+}
+
+static int appletb_kbd_hid_event(struct hid_device *hdev, struct hid_field *field,
+				      struct hid_usage *usage, __s32 value)
+{
+	struct appletb_kbd *kbd = hid_get_drvdata(hdev);
+	struct key_entry *translation;
+	struct input_dev *input;
+	int slot;
+
+	if ((usage->hid & HID_USAGE_PAGE) != HID_UP_KEYBOARD || usage->type != EV_KEY)
+		return 0;
+
+	input = field->hidinput->input;
+
+	/*
+	 * Skip non-touch-bar keys.
+	 *
+	 * Either the touch bar itself or usbhid generate a slew of key-down
+	 * events for all the meta keys. None of which we're at all interested
+	 * in.
+	 */
+	slot = appletb_tb_key_to_slot(usage->code);
+	if (slot < 0)
+		return 0;
+
+	translation = sparse_keymap_entry_from_scancode(input, usage->code);
+
+	if (translation && kbd->current_mode == APPLETB_KBD_MODE_SPCL) {
+		input_event(input, usage->type, translation->keycode, value);
+
+		return 1;
+	}
+
+	return kbd->current_mode == APPLETB_KBD_MODE_OFF;
+}
+
+static void appletb_kbd_inp_event(struct input_handle *handle, unsigned int type,
+			      unsigned int code, int value)
+{
+	struct appletb_kbd *kbd = handle->private;
+
+	if (type == EV_KEY && code == KEY_FN && appletb_tb_fn_toggle) {
+		if (value == 1) {
+			kbd->saved_mode = kbd->current_mode;
+			if (kbd->current_mode == APPLETB_KBD_MODE_SPCL)
+				appletb_kbd_set_mode(kbd, APPLETB_KBD_MODE_FN);
+			else if (kbd->current_mode == APPLETB_KBD_MODE_FN)
+				appletb_kbd_set_mode(kbd, APPLETB_KBD_MODE_SPCL);
+		} else if (value == 0) {
+			if (kbd->saved_mode != kbd->current_mode)
+				appletb_kbd_set_mode(kbd, kbd->saved_mode);
+		}
+	}
+}
+
+static int appletb_kbd_inp_connect(struct input_handler *handler,
+			       struct input_dev *dev,
+			       const struct input_device_id *id)
+{
+	struct appletb_kbd *kbd = handler->private;
+	struct input_handle *handle;
+	int rc;
+
+	if (id->driver_info == APPLETB_DEVID_KEYBOARD) {
+		handle = &kbd->kbd_handle;
+		handle->name = "tbkbd";
+	} else {
+		return -ENOENT;
+	}
+
+	if (handle->dev)
+		return -EEXIST;
+
+	handle->open = 0;
+	handle->dev = input_get_device(dev);
+	handle->handler = handler;
+	handle->private = kbd;
+
+	rc = input_register_handle(handle);
+	if (rc)
+		goto err_free_dev;
+
+	rc = input_open_device(handle);
+	if (rc)
+		goto err_unregister_handle;
+
+	return 0;
+
+ err_unregister_handle:
+	input_unregister_handle(handle);
+ err_free_dev:
+	input_put_device(handle->dev);
+	handle->dev = NULL;
+	return rc;
+}
+
+static void appletb_kbd_inp_disconnect(struct input_handle *handle)
+{
+	input_close_device(handle);
+	input_unregister_handle(handle);
+
+	input_put_device(handle->dev);
+	handle->dev = NULL;
+}
+
+static int appletb_kbd_input_configured(struct hid_device *hdev, struct hid_input *hidinput)
+{
+	int idx;
+	struct input_dev *input = hidinput->input;
+
+	/*
+	 * Clear various input capabilities that are blindly set by the hid
+	 * driver (usbkbd.c)
+	 */
+	memset(input->evbit, 0, sizeof(input->evbit));
+	memset(input->keybit, 0, sizeof(input->keybit));
+	memset(input->ledbit, 0, sizeof(input->ledbit));
+
+	__set_bit(EV_REP, input->evbit);
+
+	sparse_keymap_setup(input, appletb_kbd_keymap, NULL);
+
+	for (idx = 0; appletb_kbd_keymap[idx].type != KE_END; idx++) {
+		input_set_capability(input, EV_KEY, appletb_kbd_keymap[idx].code);
+	}
+
+	return 0;
+}
+
+static const struct input_device_id appletb_kbd_input_devices[] = {
+	{
+		.flags = INPUT_DEVICE_ID_MATCH_BUS |
+			INPUT_DEVICE_ID_MATCH_VENDOR |
+			INPUT_DEVICE_ID_MATCH_KEYBIT,
+		.bustype = BUS_USB,
+		.vendor = USB_VENDOR_ID_APPLE,
+		.keybit = { [BIT_WORD(KEY_FN)] = BIT_MASK(KEY_FN) },
+		.driver_info = APPLETB_DEVID_KEYBOARD,
+	},
+	{ }
+};
+
+static bool appletb_kbd_match_internal_device(struct input_handler *handler,
+					  struct input_dev *inp_dev)
+{
+	struct device *dev = &inp_dev->dev;
+
+	/* in kernel: dev && !is_usb_device(dev) */
+	while (dev && !(dev->type && dev->type->name &&
+			!strcmp(dev->type->name, "usb_device")))
+		dev = dev->parent;
+
+	/*
+	 * Apple labels all their internal keyboards and trackpads as such,
+	 * instead of maintaining an ever expanding list of product-id's we
+	 * just look at the device's product name.
+	 */
+	if (dev)
+		return !!strstr(to_usb_device(dev)->product, "Internal Keyboard");
+
+	return false;
+}
+
+static int appletb_kbd_probe(struct hid_device *hdev, const struct hid_device_id *id)
+{
+	struct appletb_kbd *kbd;
+	struct device *dev = &hdev->dev;
+	struct hid_field *mode_field;
+	int ret;
+
+	ret = hid_parse(hdev);
+	if (ret)
+		return dev_err_probe(dev, ret, "HID parse failed\n");
+
+	mode_field = hid_find_field(hdev, HID_OUTPUT_REPORT,
+				    HID_GD_KEYBOARD, HID_USAGE_MODE);
+	if (!mode_field)
+		return -ENODEV;
+
+	kbd = devm_kzalloc(dev, sizeof(*kbd), GFP_KERNEL);
+	if (!kbd)
+		return -ENOMEM;
+
+	kbd->mode_field = mode_field;
+
+	ret = hid_hw_start(hdev, HID_CONNECT_HIDINPUT);
+	if (ret)
+		return dev_err_probe(dev, ret, "HID hw start failed\n");
+
+	ret = hid_hw_open(hdev);
+	if (ret) {
+		dev_err_probe(dev, ret, "HID hw open failed\n");
+		goto stop_hw;
+	}
+
+	kbd->inp_handler.event = appletb_kbd_inp_event;
+	kbd->inp_handler.connect = appletb_kbd_inp_connect;
+	kbd->inp_handler.disconnect = appletb_kbd_inp_disconnect;
+	kbd->inp_handler.name = "appletb";
+	kbd->inp_handler.id_table = appletb_kbd_input_devices;
+	kbd->inp_handler.match = appletb_kbd_match_internal_device;
+	kbd->inp_handler.private = kbd;
+
+	ret = input_register_handler(&kbd->inp_handler);
+	if (ret) {
+		dev_err_probe(dev, ret, "Unable to register keyboard handler\n");
+		goto close_hw;
+	}
+
+	ret = appletb_kbd_set_mode(kbd, appletb_tb_def_mode);
+	if (ret) {
+		dev_err_probe(dev, ret, "Failed to set touchbar mode\n");
+		goto close_hw;
+	}
+
+	hid_set_drvdata(hdev, kbd);
+
+	return 0;
+
+close_hw:
+	hid_hw_close(hdev);
+stop_hw:
+	hid_hw_stop(hdev);
+	return ret;
+}
+
+static void appletb_kbd_remove(struct hid_device *hdev)
+{
+	struct appletb_kbd *kbd = hid_get_drvdata(hdev);
+
+	appletb_kbd_set_mode(kbd, APPLETB_KBD_MODE_OFF);
+
+	input_unregister_handler(&kbd->inp_handler);
+
+	hid_hw_close(hdev);
+	hid_hw_stop(hdev);
+}
+
+#ifdef CONFIG_PM
+static int appletb_kbd_suspend(struct hid_device *hdev, pm_message_t msg)
+{
+	struct appletb_kbd *kbd = hid_get_drvdata(hdev);
+
+	kbd->saved_mode = kbd->current_mode;
+	appletb_kbd_set_mode(kbd, APPLETB_KBD_MODE_OFF);
+
+	return 0;
+}
+
+static int appletb_kbd_reset_resume(struct hid_device *hdev)
+{
+	struct appletb_kbd *kbd = hid_get_drvdata(hdev);
+
+	appletb_kbd_set_mode(kbd, kbd->saved_mode);
+
+	return 0;
+}
+#endif
+
+static const struct hid_device_id appletb_kbd_hid_ids[] = {
+	/* MacBook Pro's 2018, 2019, with T2 chip: iBridge Display */
+	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_TOUCHBAR_DISPLAY) },
+	{ }
+};
+MODULE_DEVICE_TABLE(hid, appletb_kbd_hid_ids);
+
+static struct hid_driver appletb_kbd_hid_driver = {
+	.name = "hid-appletb-kbd",
+	.id_table = appletb_kbd_hid_ids,
+	.probe = appletb_kbd_probe,
+	.remove = appletb_kbd_remove,
+	.event = appletb_kbd_hid_event,
+	.input_configured = appletb_kbd_input_configured,
+#ifdef CONFIG_PM
+	.suspend = appletb_kbd_suspend,
+	.reset_resume = appletb_kbd_reset_resume,
+#endif
+	.driver.dev_groups = appletb_kbd_groups,
+};
+module_hid_driver(appletb_kbd_hid_driver);
+
+MODULE_AUTHOR("Ronald Tschalär");
+MODULE_AUTHOR("Kerem Karabay <kekrby@gmail.com>");
+MODULE_DESCRIPTION("MacBookPro Touch Bar Keyboard Mode Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 74efda212c55..f4379efdbf30 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -1912,6 +1912,31 @@ int hid_set_field(struct hid_field *field, unsigned offset, __s32 value)
 }
 EXPORT_SYMBOL_GPL(hid_set_field);
 
+struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_type,
+				 unsigned int application, unsigned int usage)
+{
+	struct list_head *report_list = &hdev->report_enum[report_type].report_list;
+	struct hid_report *report;
+	int i, j;
+
+	list_for_each_entry(report, report_list, list) {
+		if (report->application != application)
+			continue;
+
+		for (i = 0; i < report->maxfield; i++) {
+			struct hid_field *field = report->field[i];
+
+			for (j = 0; j < field->maxusage; j++) {
+				if (field->usage[j].hid == usage)
+					return field;
+			}
+		}
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(hid_find_field);
+
 static struct hid_report *hid_get_report(struct hid_report_enum *report_enum,
 		const u8 *data)
 {
diff --git a/drivers/hid/hid-google-hammer.c b/drivers/hid/hid-google-hammer.c
index 25331695ae32..3380694ba18c 100644
--- a/drivers/hid/hid-google-hammer.c
+++ b/drivers/hid/hid-google-hammer.c
@@ -418,38 +418,15 @@ static int hammer_event(struct hid_device *hid, struct hid_field *field,
 	return 0;
 }
 
-static bool hammer_has_usage(struct hid_device *hdev, unsigned int report_type,
-			unsigned application, unsigned usage)
-{
-	struct hid_report_enum *re = &hdev->report_enum[report_type];
-	struct hid_report *report;
-	int i, j;
-
-	list_for_each_entry(report, &re->report_list, list) {
-		if (report->application != application)
-			continue;
-
-		for (i = 0; i < report->maxfield; i++) {
-			struct hid_field *field = report->field[i];
-
-			for (j = 0; j < field->maxusage; j++)
-				if (field->usage[j].hid == usage)
-					return true;
-		}
-	}
-
-	return false;
-}
-
 static bool hammer_has_folded_event(struct hid_device *hdev)
 {
-	return hammer_has_usage(hdev, HID_INPUT_REPORT,
+	return !!hid_find_field(hdev, HID_INPUT_REPORT,
 				HID_GD_KEYBOARD, HID_USAGE_KBD_FOLDED);
 }
 
 static bool hammer_has_backlight_control(struct hid_device *hdev)
 {
-	return hammer_has_usage(hdev, HID_OUTPUT_REPORT,
+	return !!hid_find_field(hdev, HID_OUTPUT_REPORT,
 				HID_GD_KEYBOARD, HID_AD_BRIGHTNESS);
 }
 
diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c
index 56fc78841f24..0fed955364c3 100644
--- a/drivers/hid/hid-multitouch.c
+++ b/drivers/hid/hid-multitouch.c
@@ -72,6 +72,7 @@ MODULE_LICENSE("GPL");
 #define MT_QUIRK_FORCE_MULTI_INPUT	BIT(20)
 #define MT_QUIRK_DISABLE_WAKEUP		BIT(21)
 #define MT_QUIRK_ORIENTATION_INVERT	BIT(22)
+#define MT_QUIRK_TOUCH_IS_TIPSTATE	BIT(23)
 
 #define MT_INPUTMODE_TOUCHSCREEN	0x02
 #define MT_INPUTMODE_TOUCHPAD		0x03
@@ -145,6 +146,7 @@ struct mt_class {
 	__s32 sn_height;	/* Signal/noise ratio for height events */
 	__s32 sn_pressure;	/* Signal/noise ratio for pressure events */
 	__u8 maxcontacts;
+	bool is_direct;	/* true for touchscreens */
 	bool is_indirect;	/* true for touchpads */
 	bool export_all_inputs;	/* do not ignore mouse, keyboards, etc... */
 };
@@ -212,6 +214,7 @@ static void mt_post_parse(struct mt_device *td, struct mt_application *app);
 #define MT_CLS_GOOGLE				0x0111
 #define MT_CLS_RAZER_BLADE_STEALTH		0x0112
 #define MT_CLS_SMART_TECH			0x0113
+#define MT_CLS_APPLE_TOUCHBAR			0x0114
 
 #define MT_DEFAULT_MAXCONTACT	10
 #define MT_MAX_MAXCONTACT	250
@@ -396,6 +399,13 @@ static const struct mt_class mt_classes[] = {
 			MT_QUIRK_CONTACT_CNT_ACCURATE |
 			MT_QUIRK_SEPARATE_APP_REPORT,
 	},
+	{ .name = MT_CLS_APPLE_TOUCHBAR,
+		.quirks = MT_QUIRK_HOVERING |
+			MT_QUIRK_TOUCH_IS_TIPSTATE |
+			MT_QUIRK_SLOT_IS_CONTACTID_MINUS_ONE,
+		.is_direct = true,
+		.maxcontacts = 11,
+	},
 	{ }
 };
 
@@ -489,9 +499,6 @@ static void mt_feature_mapping(struct hid_device *hdev,
 		if (!td->maxcontacts &&
 		    field->logical_maximum <= MT_MAX_MAXCONTACT)
 			td->maxcontacts = field->logical_maximum;
-		if (td->mtclass.maxcontacts)
-			/* check if the maxcontacts is given by the class */
-			td->maxcontacts = td->mtclass.maxcontacts;
 
 		break;
 	case HID_DG_BUTTONTYPE:
@@ -565,13 +572,13 @@ static struct mt_application *mt_allocate_application(struct mt_device *td,
 	mt_application->application = application;
 	INIT_LIST_HEAD(&mt_application->mt_usages);
 
-	if (application == HID_DG_TOUCHSCREEN)
+	if (application == HID_DG_TOUCHSCREEN && !td->mtclass.is_indirect)
 		mt_application->mt_flags |= INPUT_MT_DIRECT;
 
 	/*
 	 * Model touchscreens providing buttons as touchpads.
 	 */
-	if (application == HID_DG_TOUCHPAD) {
+	if (application == HID_DG_TOUCHPAD && !td->mtclass.is_direct) {
 		mt_application->mt_flags |= INPUT_MT_POINTER;
 		td->inputmode_value = MT_INPUTMODE_TOUCHPAD;
 	}
@@ -635,7 +642,9 @@ static struct mt_report_data *mt_allocate_report_data(struct mt_device *td,
 
 		if (field->logical == HID_DG_FINGER || td->hdev->group != HID_GROUP_MULTITOUCH_WIN_8) {
 			for (n = 0; n < field->report_count; n++) {
-				if (field->usage[n].hid == HID_DG_CONTACTID) {
+				unsigned int hid = field->usage[n].hid;
+
+				if (hid == HID_DG_CONTACTID || hid == HID_DG_TRANSDUCER_INDEX) {
 					rdata->is_mt_collection = true;
 					break;
 				}
@@ -807,6 +816,15 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi,
 
 			MT_STORE_FIELD(confidence_state);
 			return 1;
+		case HID_DG_TOUCH:
+			/*
+			 * Legacy devices use TIPSWITCH and not TOUCH.
+			 * Let's just ignore this field unless the quirk is set.
+			 */
+			if (!(cls->quirks & MT_QUIRK_TOUCH_IS_TIPSTATE))
+				return -1;
+
+			fallthrough;
 		case HID_DG_TIPSWITCH:
 			if (field->application != HID_GD_SYSTEM_MULTIAXIS)
 				input_set_capability(hi->input,
@@ -814,6 +832,7 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi,
 			MT_STORE_FIELD(tip_state);
 			return 1;
 		case HID_DG_CONTACTID:
+		case HID_DG_TRANSDUCER_INDEX:
 			MT_STORE_FIELD(contactid);
 			app->touches_by_report++;
 			return 1;
@@ -869,10 +888,6 @@ static int mt_touch_input_mapping(struct hid_device *hdev, struct hid_input *hi,
 		case HID_DG_CONTACTMAX:
 			/* contact max are global to the report */
 			return -1;
-		case HID_DG_TOUCH:
-			/* Legacy devices use TIPSWITCH and not TOUCH.
-			 * Let's just ignore this field. */
-			return -1;
 		}
 		/* let hid-input decide for the others */
 		return 0;
@@ -1300,6 +1315,10 @@ static int mt_touch_input_configured(struct hid_device *hdev,
 	struct input_dev *input = hi->input;
 	int ret;
 
+	/* check if the maxcontacts is given by the class */
+	if (cls->maxcontacts)
+		td->maxcontacts = cls->maxcontacts;
+
 	if (!td->maxcontacts)
 		td->maxcontacts = MT_DEFAULT_MAXCONTACT;
 
@@ -1307,6 +1326,9 @@ static int mt_touch_input_configured(struct hid_device *hdev,
 	if (td->serial_maybe)
 		mt_post_parse_default_settings(td, app);
 
+	if (cls->is_direct)
+		app->mt_flags |= INPUT_MT_DIRECT;
+
 	if (cls->is_indirect)
 		app->mt_flags |= INPUT_MT_POINTER;
 
@@ -1733,6 +1755,15 @@ static int mt_probe(struct hid_device *hdev, const struct hid_device_id *id)
 		}
 	}
 
+	ret = hid_parse(hdev);
+	if (ret != 0)
+		return ret;
+
+	if (mtclass->name == MT_CLS_APPLE_TOUCHBAR &&
+	    !hid_find_field(hdev, HID_INPUT_REPORT,
+			    HID_DG_TOUCHPAD, HID_DG_TRANSDUCER_INDEX))
+		return -ENODEV;
+
 	td = devm_kzalloc(&hdev->dev, sizeof(struct mt_device), GFP_KERNEL);
 	if (!td) {
 		dev_err(&hdev->dev, "cannot allocate multitouch data\n");
@@ -1780,10 +1811,6 @@ static int mt_probe(struct hid_device *hdev, const struct hid_device_id *id)
 
 	timer_setup(&td->release_timer, mt_expired_timeout, 0);
 
-	ret = hid_parse(hdev);
-	if (ret != 0)
-		return ret;
-
 	if (mtclass->quirks & MT_QUIRK_FIX_CONST_CONTACT_ID)
 		mt_fix_const_fields(hdev, HID_DG_CONTACTID);
 
@@ -2235,6 +2262,11 @@ static const struct hid_device_id mt_devices[] = {
 		MT_USB_DEVICE(USB_VENDOR_ID_XIROKU,
 			USB_DEVICE_ID_XIROKU_CSR2) },
 
+	/* Apple Touch Bars */
+	{ .driver_data = MT_CLS_APPLE_TOUCHBAR,
+		HID_USB_DEVICE(USB_VENDOR_ID_APPLE,
+			       USB_DEVICE_ID_APPLE_TOUCHBAR_DISPLAY) },
+
 	/* Google MT devices */
 	{ .driver_data = MT_CLS_GOOGLE,
 		HID_DEVICE(HID_BUS_ANY, HID_GROUP_ANY, USB_VENDOR_ID_GOOGLE,
diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c
index e0bbf0c6345d..7c576d6540fe 100644
--- a/drivers/hid/hid-quirks.c
+++ b/drivers/hid/hid-quirks.c
@@ -328,8 +328,6 @@ static const struct hid_device_id hid_have_special_driver[] = {
 	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_GEYSER1_TP_ONLY) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_2021) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGIC_KEYBOARD_FINGERPRINT_2021) },
-	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_TOUCHBAR_BACKLIGHT) },
-	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_TOUCHBAR_DISPLAY) },
 #endif
 #if IS_ENABLED(CONFIG_HID_APPLEIR)
 	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL) },
@@ -338,6 +336,12 @@ static const struct hid_device_id hid_have_special_driver[] = {
 	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL4) },
 	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_IRCONTROL5) },
 #endif
+#if IS_ENABLED(CONFIG_HID_APPLETB_BL)
+	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_TOUCHBAR_BACKLIGHT) },
+#endif
+#if IS_ENABLED(CONFIG_HID_APPLETB_KBD)
+	{ HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_TOUCHBAR_DISPLAY) },
+#endif
 #if IS_ENABLED(CONFIG_HID_ASUS)
 	{ HID_I2C_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_I2C_KEYBOARD) },
 	{ HID_I2C_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_I2C_TOUCHPAD) },
diff --git a/drivers/hwmon/applesmc.c b/drivers/hwmon/applesmc.c
index fc6d6a9053ce..698f44794453 100644
--- a/drivers/hwmon/applesmc.c
+++ b/drivers/hwmon/applesmc.c
@@ -6,6 +6,7 @@
  *
  * Copyright (C) 2007 Nicolas Boichat <nicolas@boichat.ch>
  * Copyright (C) 2010 Henrik Rydberg <rydberg@euromail.se>
+ * Copyright (C) 2019 Paul Pawlowski <paul@mrarm.io>
  *
  * Based on hdaps.c driver:
  * Copyright (C) 2005 Robert Love <rml@novell.com>
@@ -18,7 +19,7 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/delay.h>
-#include <linux/platform_device.h>
+#include <linux/acpi.h>
 #include <linux/input.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -35,12 +36,24 @@
 #include <linux/bits.h>
 
 /* data port used by Apple SMC */
-#define APPLESMC_DATA_PORT	0x300
+#define APPLESMC_DATA_PORT	0
 /* command/status port used by Apple SMC */
-#define APPLESMC_CMD_PORT	0x304
+#define APPLESMC_CMD_PORT	4
 
 #define APPLESMC_NR_PORTS	32 /* 0x300-0x31f */
 
+#define APPLESMC_IOMEM_KEY_DATA	0
+#define APPLESMC_IOMEM_KEY_STATUS	0x4005
+#define APPLESMC_IOMEM_KEY_NAME	0x78
+#define APPLESMC_IOMEM_KEY_DATA_LEN	0x7D
+#define APPLESMC_IOMEM_KEY_SMC_ID	0x7E
+#define APPLESMC_IOMEM_KEY_CMD		0x7F
+#define APPLESMC_IOMEM_MIN_SIZE	0x4006
+
+#define APPLESMC_IOMEM_KEY_TYPE_CODE		0
+#define APPLESMC_IOMEM_KEY_TYPE_DATA_LEN	5
+#define APPLESMC_IOMEM_KEY_TYPE_FLAGS		6
+
 #define APPLESMC_MAX_DATA_LENGTH 32
 
 /* Apple SMC status bits */
@@ -74,6 +87,7 @@
 #define FAN_ID_FMT		"F%dID" /* r-o char[16] */
 
 #define TEMP_SENSOR_TYPE	"sp78"
+#define FLOAT_TYPE		"flt "
 
 /* List of keys used to read/write fan speeds */
 static const char *const fan_speed_fmt[] = {
@@ -83,6 +97,7 @@ static const char *const fan_speed_fmt[] = {
 	"F%dSf",		/* safe speed - not all models */
 	"F%dTg",		/* target speed (manual: rw) */
 };
+#define FAN_MANUAL_FMT "F%dMd"
 
 #define INIT_TIMEOUT_MSECS	5000	/* wait up to 5s for device init ... */
 #define INIT_WAIT_MSECS		50	/* ... in 50ms increments */
@@ -119,7 +134,7 @@ struct applesmc_entry {
 };
 
 /* Register lookup and registers common to all SMCs */
-static struct applesmc_registers {
+struct applesmc_registers {
 	struct mutex mutex;		/* register read/write mutex */
 	unsigned int key_count;		/* number of SMC registers */
 	unsigned int fan_count;		/* number of fans */
@@ -133,26 +148,38 @@ static struct applesmc_registers {
 	bool init_complete;		/* true when fully initialized */
 	struct applesmc_entry *cache;	/* cached key entries */
 	const char **index;		/* temperature key index */
-} smcreg = {
-	.mutex = __MUTEX_INITIALIZER(smcreg.mutex),
 };
 
-static const int debug;
-static struct platform_device *pdev;
-static s16 rest_x;
-static s16 rest_y;
-static u8 backlight_state[2];
+struct applesmc_device {
+	struct acpi_device *dev;
+	struct device *ldev;
+	struct applesmc_registers reg;
 
-static struct device *hwmon_dev;
-static struct input_dev *applesmc_idev;
+	bool port_base_set, iomem_base_set;
+	u16 port_base;
+	u8 *__iomem iomem_base;
+	u32 iomem_base_addr, iomem_base_size;
 
-/*
- * Last index written to key_at_index sysfs file, and value to use for all other
- * key_at_index_* sysfs files.
- */
-static unsigned int key_at_index;
+	s16 rest_x;
+	s16 rest_y;
+
+	u8 backlight_state[2];
+
+	struct device *hwmon_dev;
+	struct input_dev *idev;
+
+	/*
+	 * Last index written to key_at_index sysfs file, and value to use for all other
+	 * key_at_index_* sysfs files.
+	 */
+	unsigned int key_at_index;
 
-static struct workqueue_struct *applesmc_led_wq;
+	struct workqueue_struct *backlight_wq;
+	struct work_struct backlight_work;
+	struct led_classdev backlight_dev;
+};
+
+static const int debug;
 
 /*
  * Wait for specific status bits with a mask on the SMC.
@@ -162,7 +189,7 @@ static struct workqueue_struct *applesmc_led_wq;
  * run out past 500ms.
  */
 
-static int wait_status(u8 val, u8 mask)
+static int port_wait_status(struct applesmc_device *smc, u8 val, u8 mask)
 {
 	u8 status;
 	int us;
@@ -170,7 +197,7 @@ static int wait_status(u8 val, u8 mask)
 
 	us = APPLESMC_MIN_WAIT;
 	for (i = 0; i < 24 ; i++) {
-		status = inb(APPLESMC_CMD_PORT);
+		status = inb(smc->port_base + APPLESMC_CMD_PORT);
 		if ((status & mask) == val)
 			return 0;
 		usleep_range(us, us * 2);
@@ -180,13 +207,13 @@ static int wait_status(u8 val, u8 mask)
 	return -EIO;
 }
 
-/* send_byte - Write to SMC data port. Callers must hold applesmc_lock. */
+/* port_send_byte - Write to SMC data port. Callers must hold applesmc_lock. */
 
-static int send_byte(u8 cmd, u16 port)
+static int port_send_byte(struct applesmc_device *smc, u8 cmd, u16 port)
 {
 	int status;
 
-	status = wait_status(0, SMC_STATUS_IB_CLOSED);
+	status = port_wait_status(smc, 0, SMC_STATUS_IB_CLOSED);
 	if (status)
 		return status;
 	/*
@@ -195,24 +222,25 @@ static int send_byte(u8 cmd, u16 port)
 	 * this extra read may not happen if status returns both
 	 * simultaneously and this would appear to be required.
 	 */
-	status = wait_status(SMC_STATUS_BUSY, SMC_STATUS_BUSY);
+	status = port_wait_status(smc, SMC_STATUS_BUSY, SMC_STATUS_BUSY);
 	if (status)
 		return status;
 
-	outb(cmd, port);
+	outb(cmd, smc->port_base + port);
 	return 0;
 }
 
-/* send_command - Write a command to the SMC. Callers must hold applesmc_lock. */
+/* port_send_command - Write a command to the SMC. Callers must hold applesmc_lock. */
 
-static int send_command(u8 cmd)
+static int port_send_command(struct applesmc_device *smc, u8 cmd)
 {
 	int ret;
 
-	ret = wait_status(0, SMC_STATUS_IB_CLOSED);
+	ret = port_wait_status(smc, 0, SMC_STATUS_IB_CLOSED);
 	if (ret)
 		return ret;
-	outb(cmd, APPLESMC_CMD_PORT);
+
+	outb(cmd, smc->port_base + APPLESMC_CMD_PORT);
 	return 0;
 }
 
@@ -222,110 +250,304 @@ static int send_command(u8 cmd)
  * If busy is stuck high after the command then the SMC is jammed.
  */
 
-static int smc_sane(void)
+static int port_smc_sane(struct applesmc_device *smc)
 {
 	int ret;
 
-	ret = wait_status(0, SMC_STATUS_BUSY);
+	ret = port_wait_status(smc, 0, SMC_STATUS_BUSY);
 	if (!ret)
 		return ret;
-	ret = send_command(APPLESMC_READ_CMD);
+	ret = port_send_command(smc, APPLESMC_READ_CMD);
 	if (ret)
 		return ret;
-	return wait_status(0, SMC_STATUS_BUSY);
+	return port_wait_status(smc, 0, SMC_STATUS_BUSY);
 }
 
-static int send_argument(const char *key)
+static int port_send_argument(struct applesmc_device *smc, const char *key)
 {
 	int i;
 
 	for (i = 0; i < 4; i++)
-		if (send_byte(key[i], APPLESMC_DATA_PORT))
+		if (port_send_byte(smc, key[i], APPLESMC_DATA_PORT))
 			return -EIO;
 	return 0;
 }
 
-static int read_smc(u8 cmd, const char *key, u8 *buffer, u8 len)
+static int port_read_smc(struct applesmc_device *smc, u8 cmd, const char *key,
+	u8 *buffer, u8 len)
 {
 	u8 status, data = 0;
 	int i;
 	int ret;
 
-	ret = smc_sane();
+	ret = port_smc_sane(smc);
 	if (ret)
 		return ret;
 
-	if (send_command(cmd) || send_argument(key)) {
+	if (port_send_command(smc, cmd) || port_send_argument(smc, key)) {
 		pr_warn("%.4s: read arg fail\n", key);
 		return -EIO;
 	}
 
 	/* This has no effect on newer (2012) SMCs */
-	if (send_byte(len, APPLESMC_DATA_PORT)) {
+	if (port_send_byte(smc, len, APPLESMC_DATA_PORT)) {
 		pr_warn("%.4s: read len fail\n", key);
 		return -EIO;
 	}
 
 	for (i = 0; i < len; i++) {
-		if (wait_status(SMC_STATUS_AWAITING_DATA | SMC_STATUS_BUSY,
+		if (port_wait_status(smc,
+				SMC_STATUS_AWAITING_DATA | SMC_STATUS_BUSY,
 				SMC_STATUS_AWAITING_DATA | SMC_STATUS_BUSY)) {
 			pr_warn("%.4s: read data[%d] fail\n", key, i);
 			return -EIO;
 		}
-		buffer[i] = inb(APPLESMC_DATA_PORT);
+		buffer[i] = inb(smc->port_base + APPLESMC_DATA_PORT);
 	}
 
 	/* Read the data port until bit0 is cleared */
 	for (i = 0; i < 16; i++) {
 		udelay(APPLESMC_MIN_WAIT);
-		status = inb(APPLESMC_CMD_PORT);
+		status = inb(smc->port_base + APPLESMC_CMD_PORT);
 		if (!(status & SMC_STATUS_AWAITING_DATA))
 			break;
-		data = inb(APPLESMC_DATA_PORT);
+		data = inb(smc->port_base + APPLESMC_DATA_PORT);
 	}
 	if (i)
 		pr_warn("flushed %d bytes, last value is: %d\n", i, data);
 
-	return wait_status(0, SMC_STATUS_BUSY);
+	return port_wait_status(smc, 0, SMC_STATUS_BUSY);
 }
 
-static int write_smc(u8 cmd, const char *key, const u8 *buffer, u8 len)
+static int port_write_smc(struct applesmc_device *smc, u8 cmd, const char *key,
+	const u8 *buffer, u8 len)
 {
 	int i;
 	int ret;
 
-	ret = smc_sane();
+	ret = port_smc_sane(smc);
 	if (ret)
 		return ret;
 
-	if (send_command(cmd) || send_argument(key)) {
+	if (port_send_command(smc, cmd) || port_send_argument(smc, key)) {
 		pr_warn("%s: write arg fail\n", key);
 		return -EIO;
 	}
 
-	if (send_byte(len, APPLESMC_DATA_PORT)) {
+	if (port_send_byte(smc, len, APPLESMC_DATA_PORT)) {
 		pr_warn("%.4s: write len fail\n", key);
 		return -EIO;
 	}
 
 	for (i = 0; i < len; i++) {
-		if (send_byte(buffer[i], APPLESMC_DATA_PORT)) {
+		if (port_send_byte(smc, buffer[i], APPLESMC_DATA_PORT)) {
 			pr_warn("%s: write data fail\n", key);
 			return -EIO;
 		}
 	}
 
-	return wait_status(0, SMC_STATUS_BUSY);
+	return port_wait_status(smc, 0, SMC_STATUS_BUSY);
 }
 
-static int read_register_count(unsigned int *count)
+static int port_get_smc_key_info(struct applesmc_device *smc,
+	const char *key, struct applesmc_entry *info)
 {
-	__be32 be;
 	int ret;
+	u8 raw[6];
 
-	ret = read_smc(APPLESMC_READ_CMD, KEY_COUNT_KEY, (u8 *)&be, 4);
+	ret = port_read_smc(smc, APPLESMC_GET_KEY_TYPE_CMD, key, raw, 6);
 	if (ret)
 		return ret;
+	info->len = raw[0];
+	memcpy(info->type, &raw[1], 4);
+	info->flags = raw[5];
+	return 0;
+}
+
+
+/*
+ * MMIO based communication.
+ * TODO: Use updated mechanism for cmd timeout/retry
+ */
+
+static void iomem_clear_status(struct applesmc_device *smc)
+{
+	if (ioread8(smc->iomem_base + APPLESMC_IOMEM_KEY_STATUS))
+		iowrite8(0, smc->iomem_base + APPLESMC_IOMEM_KEY_STATUS);
+}
+
+static int iomem_wait_read(struct applesmc_device *smc)
+{
+	u8 status;
+	int us;
+	int i;
+
+	us = APPLESMC_MIN_WAIT;
+	for (i = 0; i < 24 ; i++) {
+		status = ioread8(smc->iomem_base + APPLESMC_IOMEM_KEY_STATUS);
+		if (status & 0x20)
+			return 0;
+		usleep_range(us, us * 2);
+		if (i > 9)
+			us <<= 1;
+	}
+
+	dev_warn(smc->ldev, "%s... timeout\n", __func__);
+	return -EIO;
+}
+
+static int iomem_read_smc(struct applesmc_device *smc, u8 cmd, const char *key,
+	u8 *buffer, u8 len)
+{
+	u8 err, remote_len;
+	u32 key_int = *((u32 *) key);
+
+	iomem_clear_status(smc);
+	iowrite32(key_int, smc->iomem_base + APPLESMC_IOMEM_KEY_NAME);
+	iowrite32(0, smc->iomem_base + APPLESMC_IOMEM_KEY_SMC_ID);
+	iowrite32(cmd, smc->iomem_base + APPLESMC_IOMEM_KEY_CMD);
+
+	if (iomem_wait_read(smc))
+		return -EIO;
+
+	err = ioread8(smc->iomem_base + APPLESMC_IOMEM_KEY_CMD);
+	if (err != 0) {
+		dev_warn(smc->ldev, "read_smc_mmio(%x %8x/%.4s) failed: %u\n",
+				cmd, key_int, key, err);
+		return -EIO;
+	}
+
+	if (cmd == APPLESMC_READ_CMD) {
+		remote_len = ioread8(smc->iomem_base + APPLESMC_IOMEM_KEY_DATA_LEN);
+		if (remote_len != len) {
+			dev_warn(smc->ldev,
+				 "read_smc_mmio(%x %8x/%.4s) failed: buffer length mismatch (remote = %u, requested = %u)\n",
+				 cmd, key_int, key, remote_len, len);
+			return -EINVAL;
+		}
+	} else {
+		remote_len = len;
+	}
+
+	memcpy_fromio(buffer, smc->iomem_base + APPLESMC_IOMEM_KEY_DATA,
+			remote_len);
+
+	dev_dbg(smc->ldev, "read_smc_mmio(%x %8x/%.4s): buflen=%u reslen=%u\n",
+			cmd, key_int, key, len, remote_len);
+	print_hex_dump_bytes("read_smc_mmio(): ", DUMP_PREFIX_NONE, buffer, remote_len);
+	return 0;
+}
+
+static int iomem_get_smc_key_type(struct applesmc_device *smc, const char *key,
+	struct applesmc_entry *e)
+{
+	u8 err;
+	u8 cmd = APPLESMC_GET_KEY_TYPE_CMD;
+	u32 key_int = *((u32 *) key);
+
+	iomem_clear_status(smc);
+	iowrite32(key_int, smc->iomem_base + APPLESMC_IOMEM_KEY_NAME);
+	iowrite32(0, smc->iomem_base + APPLESMC_IOMEM_KEY_SMC_ID);
+	iowrite32(cmd, smc->iomem_base + APPLESMC_IOMEM_KEY_CMD);
+
+	if (iomem_wait_read(smc))
+		return -EIO;
+
+	err = ioread8(smc->iomem_base + APPLESMC_IOMEM_KEY_CMD);
+	if (err != 0) {
+		dev_warn(smc->ldev, "get_smc_key_type_mmio(%.4s) failed: %u\n", key, err);
+		return -EIO;
+	}
+
+	e->len = ioread8(smc->iomem_base + APPLESMC_IOMEM_KEY_TYPE_DATA_LEN);
+	*((uint32_t *) e->type) = ioread32(
+			smc->iomem_base + APPLESMC_IOMEM_KEY_TYPE_CODE);
+	e->flags = ioread8(smc->iomem_base + APPLESMC_IOMEM_KEY_TYPE_FLAGS);
+
+	dev_dbg(smc->ldev, "get_smc_key_type_mmio(%.4s): len=%u type=%.4s flags=%x\n",
+		key, e->len, e->type, e->flags);
+	return 0;
+}
+
+static int iomem_write_smc(struct applesmc_device *smc, u8 cmd, const char *key,
+	const u8 *buffer, u8 len)
+{
+	u8 err;
+	u32 key_int = *((u32 *) key);
+
+	iomem_clear_status(smc);
+	iowrite32(key_int, smc->iomem_base + APPLESMC_IOMEM_KEY_NAME);
+	memcpy_toio(smc->iomem_base + APPLESMC_IOMEM_KEY_DATA, buffer, len);
+	iowrite32(len, smc->iomem_base + APPLESMC_IOMEM_KEY_DATA_LEN);
+	iowrite32(0, smc->iomem_base + APPLESMC_IOMEM_KEY_SMC_ID);
+	iowrite32(cmd, smc->iomem_base + APPLESMC_IOMEM_KEY_CMD);
+
+	if (iomem_wait_read(smc))
+		return -EIO;
+
+	err = ioread8(smc->iomem_base + APPLESMC_IOMEM_KEY_CMD);
+	if (err != 0) {
+		dev_warn(smc->ldev, "write_smc_mmio(%x %.4s) failed: %u\n", cmd, key, err);
+		print_hex_dump_bytes("write_smc_mmio(): ", DUMP_PREFIX_NONE, buffer, len);
+		return -EIO;
+	}
+
+	dev_dbg(smc->ldev, "write_smc_mmio(%x %.4s): buflen=%u\n", cmd, key, len);
+	print_hex_dump_bytes("write_smc_mmio(): ", DUMP_PREFIX_NONE, buffer, len);
+	return 0;
+}
+
+
+static int read_smc(struct applesmc_device *smc, const char *key,
+	u8 *buffer, u8 len)
+{
+	if (smc->iomem_base_set)
+		return iomem_read_smc(smc, APPLESMC_READ_CMD, key, buffer, len);
+	else
+		return port_read_smc(smc, APPLESMC_READ_CMD, key, buffer, len);
+}
+
+static int write_smc(struct applesmc_device *smc, const char *key,
+	const u8 *buffer, u8 len)
+{
+	if (smc->iomem_base_set)
+		return iomem_write_smc(smc, APPLESMC_WRITE_CMD, key, buffer, len);
+	else
+		return port_write_smc(smc, APPLESMC_WRITE_CMD, key, buffer, len);
+}
+
+static int get_smc_key_by_index(struct applesmc_device *smc,
+	unsigned int index, char *key)
+{
+	__be32 be;
+
+	be = cpu_to_be32(index);
+	if (smc->iomem_base_set)
+		return iomem_read_smc(smc, APPLESMC_GET_KEY_BY_INDEX_CMD,
+							  (const char *) &be, (u8 *) key, 4);
+	else
+		return port_read_smc(smc, APPLESMC_GET_KEY_BY_INDEX_CMD,
+							 (const char *) &be, (u8 *) key, 4);
+}
+
+static int get_smc_key_info(struct applesmc_device *smc, const char *key,
+	struct applesmc_entry *info)
+{
+	if (smc->iomem_base_set)
+		return iomem_get_smc_key_type(smc, key, info);
+	else
+		return port_get_smc_key_info(smc, key, info);
+}
+
+static int read_register_count(struct applesmc_device *smc,
+	unsigned int *count)
+{
+	__be32 be;
+	int ret;
+
+	ret = read_smc(smc, KEY_COUNT_KEY, (u8 *)&be, 4);
+	if (ret < 0)
+		return ret;
 
 	*count = be32_to_cpu(be);
 	return 0;
@@ -338,76 +560,73 @@ static int read_register_count(unsigned int *count)
  * All functions below are concurrency safe - callers should NOT hold lock.
  */
 
-static int applesmc_read_entry(const struct applesmc_entry *entry,
-			       u8 *buf, u8 len)
+static int applesmc_read_entry(struct applesmc_device *smc,
+	const struct applesmc_entry *entry, u8 *buf, u8 len)
 {
 	int ret;
 
 	if (entry->len != len)
 		return -EINVAL;
-	mutex_lock(&smcreg.mutex);
-	ret = read_smc(APPLESMC_READ_CMD, entry->key, buf, len);
-	mutex_unlock(&smcreg.mutex);
+	mutex_lock(&smc->reg.mutex);
+	ret = read_smc(smc, entry->key, buf, len);
+	mutex_unlock(&smc->reg.mutex);
 
 	return ret;
 }
 
-static int applesmc_write_entry(const struct applesmc_entry *entry,
-				const u8 *buf, u8 len)
+static int applesmc_write_entry(struct applesmc_device *smc,
+	const struct applesmc_entry *entry, const u8 *buf, u8 len)
 {
 	int ret;
 
 	if (entry->len != len)
 		return -EINVAL;
-	mutex_lock(&smcreg.mutex);
-	ret = write_smc(APPLESMC_WRITE_CMD, entry->key, buf, len);
-	mutex_unlock(&smcreg.mutex);
+	mutex_lock(&smc->reg.mutex);
+	ret = write_smc(smc, entry->key, buf, len);
+	mutex_unlock(&smc->reg.mutex);
 	return ret;
 }
 
-static const struct applesmc_entry *applesmc_get_entry_by_index(int index)
+static const struct applesmc_entry *applesmc_get_entry_by_index(
+	struct applesmc_device *smc, int index)
 {
-	struct applesmc_entry *cache = &smcreg.cache[index];
-	u8 key[4], info[6];
-	__be32 be;
+	struct applesmc_entry *cache = &smc->reg.cache[index];
+	char key[4];
 	int ret = 0;
 
 	if (cache->valid)
 		return cache;
 
-	mutex_lock(&smcreg.mutex);
+	mutex_lock(&smc->reg.mutex);
 
 	if (cache->valid)
 		goto out;
-	be = cpu_to_be32(index);
-	ret = read_smc(APPLESMC_GET_KEY_BY_INDEX_CMD, (u8 *)&be, key, 4);
+	ret = get_smc_key_by_index(smc, index, key);
 	if (ret)
 		goto out;
-	ret = read_smc(APPLESMC_GET_KEY_TYPE_CMD, key, info, 6);
+	memcpy(cache->key, key, 4);
+
+	ret = get_smc_key_info(smc, key, cache);
 	if (ret)
 		goto out;
-
-	memcpy(cache->key, key, 4);
-	cache->len = info[0];
-	memcpy(cache->type, &info[1], 4);
-	cache->flags = info[5];
 	cache->valid = true;
 
 out:
-	mutex_unlock(&smcreg.mutex);
+	mutex_unlock(&smc->reg.mutex);
 	if (ret)
 		return ERR_PTR(ret);
 	return cache;
 }
 
-static int applesmc_get_lower_bound(unsigned int *lo, const char *key)
+static int applesmc_get_lower_bound(struct applesmc_device *smc,
+	unsigned int *lo, const char *key)
 {
-	int begin = 0, end = smcreg.key_count;
+	int begin = 0, end = smc->reg.key_count;
 	const struct applesmc_entry *entry;
 
 	while (begin != end) {
 		int middle = begin + (end - begin) / 2;
-		entry = applesmc_get_entry_by_index(middle);
+		entry = applesmc_get_entry_by_index(smc, middle);
 		if (IS_ERR(entry)) {
 			*lo = 0;
 			return PTR_ERR(entry);
@@ -422,16 +641,17 @@ static int applesmc_get_lower_bound(unsigned int *lo, const char *key)
 	return 0;
 }
 
-static int applesmc_get_upper_bound(unsigned int *hi, const char *key)
+static int applesmc_get_upper_bound(struct applesmc_device *smc,
+	unsigned int *hi, const char *key)
 {
-	int begin = 0, end = smcreg.key_count;
+	int begin = 0, end = smc->reg.key_count;
 	const struct applesmc_entry *entry;
 
 	while (begin != end) {
 		int middle = begin + (end - begin) / 2;
-		entry = applesmc_get_entry_by_index(middle);
+		entry = applesmc_get_entry_by_index(smc, middle);
 		if (IS_ERR(entry)) {
-			*hi = smcreg.key_count;
+			*hi = smc->reg.key_count;
 			return PTR_ERR(entry);
 		}
 		if (strcmp(key, entry->key) < 0)
@@ -444,50 +664,54 @@ static int applesmc_get_upper_bound(unsigned int *hi, const char *key)
 	return 0;
 }
 
-static const struct applesmc_entry *applesmc_get_entry_by_key(const char *key)
+static const struct applesmc_entry *applesmc_get_entry_by_key(
+	struct applesmc_device *smc, const char *key)
 {
 	int begin, end;
 	int ret;
 
-	ret = applesmc_get_lower_bound(&begin, key);
+	ret = applesmc_get_lower_bound(smc, &begin, key);
 	if (ret)
 		return ERR_PTR(ret);
-	ret = applesmc_get_upper_bound(&end, key);
+	ret = applesmc_get_upper_bound(smc, &end, key);
 	if (ret)
 		return ERR_PTR(ret);
 	if (end - begin != 1)
 		return ERR_PTR(-EINVAL);
 
-	return applesmc_get_entry_by_index(begin);
+	return applesmc_get_entry_by_index(smc, begin);
 }
 
-static int applesmc_read_key(const char *key, u8 *buffer, u8 len)
+static int applesmc_read_key(struct applesmc_device *smc,
+	const char *key, u8 *buffer, u8 len)
 {
 	const struct applesmc_entry *entry;
 
-	entry = applesmc_get_entry_by_key(key);
+	entry = applesmc_get_entry_by_key(smc, key);
 	if (IS_ERR(entry))
 		return PTR_ERR(entry);
 
-	return applesmc_read_entry(entry, buffer, len);
+	return applesmc_read_entry(smc, entry, buffer, len);
 }
 
-static int applesmc_write_key(const char *key, const u8 *buffer, u8 len)
+static int applesmc_write_key(struct applesmc_device *smc,
+	const char *key, const u8 *buffer, u8 len)
 {
 	const struct applesmc_entry *entry;
 
-	entry = applesmc_get_entry_by_key(key);
+	entry = applesmc_get_entry_by_key(smc, key);
 	if (IS_ERR(entry))
 		return PTR_ERR(entry);
 
-	return applesmc_write_entry(entry, buffer, len);
+	return applesmc_write_entry(smc, entry, buffer, len);
 }
 
-static int applesmc_has_key(const char *key, bool *value)
+static int applesmc_has_key(struct applesmc_device *smc,
+	const char *key, bool *value)
 {
 	const struct applesmc_entry *entry;
 
-	entry = applesmc_get_entry_by_key(key);
+	entry = applesmc_get_entry_by_key(smc, key);
 	if (IS_ERR(entry) && PTR_ERR(entry) != -EINVAL)
 		return PTR_ERR(entry);
 
@@ -498,12 +722,13 @@ static int applesmc_has_key(const char *key, bool *value)
 /*
  * applesmc_read_s16 - Read 16-bit signed big endian register
  */
-static int applesmc_read_s16(const char *key, s16 *value)
+static int applesmc_read_s16(struct applesmc_device *smc,
+	const char *key, s16 *value)
 {
 	u8 buffer[2];
 	int ret;
 
-	ret = applesmc_read_key(key, buffer, 2);
+	ret = applesmc_read_key(smc, key, buffer, 2);
 	if (ret)
 		return ret;
 
@@ -511,31 +736,68 @@ static int applesmc_read_s16(const char *key, s16 *value)
 	return 0;
 }
 
+/**
+ * applesmc_float_to_u32 - Retrieve the integral part of a float.
+ * This is needed because Apple made fans use float values in the T2.
+ * The fractional point is not significantly useful though, and the integral
+ * part can be easily extracted.
+ */
+static inline u32 applesmc_float_to_u32(u32 d)
+{
+	u8 sign = (u8) ((d >> 31) & 1);
+	s32 exp = (s32) ((d >> 23) & 0xff) - 0x7f;
+	u32 fr = d & ((1u << 23) - 1);
+
+	if (sign || exp < 0)
+		return 0;
+
+	return (u32) ((1u << exp) + (fr >> (23 - exp)));
+}
+
+/**
+ * applesmc_u32_to_float - Convert an u32 into a float.
+ * See applesmc_float_to_u32 for a rationale.
+ */
+static inline u32 applesmc_u32_to_float(u32 d)
+{
+	u32 dc = d, bc = 0, exp;
+
+	if (!d)
+		return 0;
+
+	while (dc >>= 1)
+		++bc;
+	exp = 0x7f + bc;
+
+	return (u32) ((exp << 23) |
+		((d << (23 - (exp - 0x7f))) & ((1u << 23) - 1)));
+}
 /*
  * applesmc_device_init - initialize the accelerometer.  Can sleep.
  */
-static void applesmc_device_init(void)
+static void applesmc_device_init(struct applesmc_device *smc)
 {
 	int total;
 	u8 buffer[2];
 
-	if (!smcreg.has_accelerometer)
+	if (!smc->reg.has_accelerometer)
 		return;
 
 	for (total = INIT_TIMEOUT_MSECS; total > 0; total -= INIT_WAIT_MSECS) {
-		if (!applesmc_read_key(MOTION_SENSOR_KEY, buffer, 2) &&
+		if (!applesmc_read_key(smc, MOTION_SENSOR_KEY, buffer, 2) &&
 				(buffer[0] != 0x00 || buffer[1] != 0x00))
 			return;
 		buffer[0] = 0xe0;
 		buffer[1] = 0x00;
-		applesmc_write_key(MOTION_SENSOR_KEY, buffer, 2);
+		applesmc_write_key(smc, MOTION_SENSOR_KEY, buffer, 2);
 		msleep(INIT_WAIT_MSECS);
 	}
 
 	pr_warn("failed to init the device\n");
 }
 
-static int applesmc_init_index(struct applesmc_registers *s)
+static int applesmc_init_index(struct applesmc_device *smc,
+	struct applesmc_registers *s)
 {
 	const struct applesmc_entry *entry;
 	unsigned int i;
@@ -548,7 +810,7 @@ static int applesmc_init_index(struct applesmc_registers *s)
 		return -ENOMEM;
 
 	for (i = s->temp_begin; i < s->temp_end; i++) {
-		entry = applesmc_get_entry_by_index(i);
+		entry = applesmc_get_entry_by_index(smc, i);
 		if (IS_ERR(entry))
 			continue;
 		if (strcmp(entry->type, TEMP_SENSOR_TYPE))
@@ -562,9 +824,9 @@ static int applesmc_init_index(struct applesmc_registers *s)
 /*
  * applesmc_init_smcreg_try - Try to initialize register cache. Idempotent.
  */
-static int applesmc_init_smcreg_try(void)
+static int applesmc_init_smcreg_try(struct applesmc_device *smc)
 {
-	struct applesmc_registers *s = &smcreg;
+	struct applesmc_registers *s = &smc->reg;
 	bool left_light_sensor = false, right_light_sensor = false;
 	unsigned int count;
 	u8 tmp[1];
@@ -573,7 +835,7 @@ static int applesmc_init_smcreg_try(void)
 	if (s->init_complete)
 		return 0;
 
-	ret = read_register_count(&count);
+	ret = read_register_count(smc, &count);
 	if (ret)
 		return ret;
 
@@ -590,35 +852,35 @@ static int applesmc_init_smcreg_try(void)
 	if (!s->cache)
 		return -ENOMEM;
 
-	ret = applesmc_read_key(FANS_COUNT, tmp, 1);
+	ret = applesmc_read_key(smc, FANS_COUNT, tmp, 1);
 	if (ret)
 		return ret;
 	s->fan_count = tmp[0];
 	if (s->fan_count > 10)
 		s->fan_count = 10;
 
-	ret = applesmc_get_lower_bound(&s->temp_begin, "T");
+	ret = applesmc_get_lower_bound(smc, &s->temp_begin, "T");
 	if (ret)
 		return ret;
-	ret = applesmc_get_lower_bound(&s->temp_end, "U");
+	ret = applesmc_get_lower_bound(smc, &s->temp_end, "U");
 	if (ret)
 		return ret;
 	s->temp_count = s->temp_end - s->temp_begin;
 
-	ret = applesmc_init_index(s);
+	ret = applesmc_init_index(smc, s);
 	if (ret)
 		return ret;
 
-	ret = applesmc_has_key(LIGHT_SENSOR_LEFT_KEY, &left_light_sensor);
+	ret = applesmc_has_key(smc, LIGHT_SENSOR_LEFT_KEY, &left_light_sensor);
 	if (ret)
 		return ret;
-	ret = applesmc_has_key(LIGHT_SENSOR_RIGHT_KEY, &right_light_sensor);
+	ret = applesmc_has_key(smc, LIGHT_SENSOR_RIGHT_KEY, &right_light_sensor);
 	if (ret)
 		return ret;
-	ret = applesmc_has_key(MOTION_SENSOR_KEY, &s->has_accelerometer);
+	ret = applesmc_has_key(smc, MOTION_SENSOR_KEY, &s->has_accelerometer);
 	if (ret)
 		return ret;
-	ret = applesmc_has_key(BACKLIGHT_KEY, &s->has_key_backlight);
+	ret = applesmc_has_key(smc, BACKLIGHT_KEY, &s->has_key_backlight);
 	if (ret)
 		return ret;
 
@@ -634,13 +896,13 @@ static int applesmc_init_smcreg_try(void)
 	return 0;
 }
 
-static void applesmc_destroy_smcreg(void)
+static void applesmc_destroy_smcreg(struct applesmc_device *smc)
 {
-	kfree(smcreg.index);
-	smcreg.index = NULL;
-	kfree(smcreg.cache);
-	smcreg.cache = NULL;
-	smcreg.init_complete = false;
+	kfree(smc->reg.index);
+	smc->reg.index = NULL;
+	kfree(smc->reg.cache);
+	smc->reg.cache = NULL;
+	smc->reg.init_complete = false;
 }
 
 /*
@@ -649,12 +911,12 @@ static void applesmc_destroy_smcreg(void)
  * Retries until initialization is successful, or the operation times out.
  *
  */
-static int applesmc_init_smcreg(void)
+static int applesmc_init_smcreg(struct applesmc_device *smc)
 {
 	int ms, ret;
 
 	for (ms = 0; ms < INIT_TIMEOUT_MSECS; ms += INIT_WAIT_MSECS) {
-		ret = applesmc_init_smcreg_try();
+		ret = applesmc_init_smcreg_try(smc);
 		if (!ret) {
 			if (ms)
 				pr_info("init_smcreg() took %d ms\n", ms);
@@ -663,50 +925,223 @@ static int applesmc_init_smcreg(void)
 		msleep(INIT_WAIT_MSECS);
 	}
 
-	applesmc_destroy_smcreg();
+	applesmc_destroy_smcreg(smc);
 
 	return ret;
 }
 
 /* Device model stuff */
-static int applesmc_probe(struct platform_device *dev)
+
+static int applesmc_init_resources(struct applesmc_device *smc);
+static void applesmc_free_resources(struct applesmc_device *smc);
+static int applesmc_create_modules(struct applesmc_device *smc);
+static void applesmc_destroy_modules(struct applesmc_device *smc);
+
+static int applesmc_add(struct acpi_device *dev)
 {
+	struct applesmc_device *smc;
 	int ret;
 
-	ret = applesmc_init_smcreg();
+	smc = kzalloc(sizeof(struct applesmc_device), GFP_KERNEL);
+	if (!smc)
+		return -ENOMEM;
+	smc->dev = dev;
+	smc->ldev = &dev->dev;
+	mutex_init(&smc->reg.mutex);
+
+	dev_set_drvdata(&dev->dev, smc);
+
+	ret = applesmc_init_resources(smc);
 	if (ret)
-		return ret;
+		goto out_mem;
+
+	ret = applesmc_init_smcreg(smc);
+	if (ret)
+		goto out_res;
+
+	applesmc_device_init(smc);
+
+	ret = applesmc_create_modules(smc);
+	if (ret)
+		goto out_reg;
+
+	return 0;
+
+out_reg:
+	applesmc_destroy_smcreg(smc);
+out_res:
+	applesmc_free_resources(smc);
+out_mem:
+	dev_set_drvdata(&dev->dev, NULL);
+	mutex_destroy(&smc->reg.mutex);
+	kfree(smc);
+
+	return ret;
+}
+
+static void applesmc_remove(struct acpi_device *dev)
+{
+	struct applesmc_device *smc = dev_get_drvdata(&dev->dev);
+
+	applesmc_destroy_modules(smc);
+	applesmc_destroy_smcreg(smc);
+	applesmc_free_resources(smc);
 
-	applesmc_device_init();
+	mutex_destroy(&smc->reg.mutex);
+	kfree(smc);
+
+	return;
+}
+
+static acpi_status applesmc_walk_resources(struct acpi_resource *res,
+	void *data)
+{
+	struct applesmc_device *smc = data;
+
+	switch (res->type) {
+	case ACPI_RESOURCE_TYPE_IO:
+		if (!smc->port_base_set) {
+			if (res->data.io.address_length < APPLESMC_NR_PORTS)
+				return AE_ERROR;
+			smc->port_base = res->data.io.minimum;
+			smc->port_base_set = true;
+		}
+		return AE_OK;
+
+	case ACPI_RESOURCE_TYPE_FIXED_MEMORY32:
+		if (!smc->iomem_base_set) {
+			if (res->data.fixed_memory32.address_length <
+					APPLESMC_IOMEM_MIN_SIZE) {
+				dev_warn(smc->ldev, "found iomem but it's too small: %u\n",
+						 res->data.fixed_memory32.address_length);
+				return AE_OK;
+			}
+			smc->iomem_base_addr = res->data.fixed_memory32.address;
+			smc->iomem_base_size = res->data.fixed_memory32.address_length;
+			smc->iomem_base_set = true;
+		}
+		return AE_OK;
+
+	case ACPI_RESOURCE_TYPE_END_TAG:
+		if (smc->port_base_set)
+			return AE_OK;
+		else
+			return AE_NOT_FOUND;
+
+	default:
+		return AE_OK;
+	}
+}
+
+static int applesmc_try_enable_iomem(struct applesmc_device *smc);
+
+static int applesmc_init_resources(struct applesmc_device *smc)
+{
+	int ret;
+
+	ret = acpi_walk_resources(smc->dev->handle, METHOD_NAME__CRS,
+			applesmc_walk_resources, smc);
+	if (ACPI_FAILURE(ret))
+		return -ENXIO;
+
+	if (!request_region(smc->port_base, APPLESMC_NR_PORTS, "applesmc"))
+		return -ENXIO;
+
+	if (smc->iomem_base_set) {
+		if (applesmc_try_enable_iomem(smc))
+			smc->iomem_base_set = false;
+	}
+
+	return 0;
+}
+
+static int applesmc_try_enable_iomem(struct applesmc_device *smc)
+{
+	u8 test_val, ldkn_version;
+
+	dev_dbg(smc->ldev, "Trying to enable iomem based communication\n");
+	smc->iomem_base = ioremap(smc->iomem_base_addr, smc->iomem_base_size);
+	if (!smc->iomem_base)
+		goto out;
+
+	/* Apple's driver does this check for some reason */
+	test_val = ioread8(smc->iomem_base + APPLESMC_IOMEM_KEY_STATUS);
+	if (test_val == 0xff) {
+		dev_warn(smc->ldev,
+			 "iomem enable failed: initial status is 0xff (is %x)\n",
+			 test_val);
+		goto out_iomem;
+	}
+
+	if (read_smc(smc, "LDKN", &ldkn_version, 1)) {
+		dev_warn(smc->ldev, "iomem enable failed: ldkn read failed\n");
+		goto out_iomem;
+	}
+
+	if (ldkn_version < 2) {
+		dev_warn(smc->ldev,
+			 "iomem enable failed: ldkn version %u is less than minimum (2)\n",
+			 ldkn_version);
+		goto out_iomem;
+	}
 
 	return 0;
+
+out_iomem:
+	iounmap(smc->iomem_base);
+
+out:
+	return -ENXIO;
+}
+
+static void applesmc_free_resources(struct applesmc_device *smc)
+{
+	if (smc->iomem_base_set)
+		iounmap(smc->iomem_base);
+	release_region(smc->port_base, APPLESMC_NR_PORTS);
 }
 
 /* Synchronize device with memorized backlight state */
 static int applesmc_pm_resume(struct device *dev)
 {
-	if (smcreg.has_key_backlight)
-		applesmc_write_key(BACKLIGHT_KEY, backlight_state, 2);
+	struct applesmc_device *smc = dev_get_drvdata(dev);
+
+	if (smc->reg.has_key_backlight)
+		applesmc_write_key(smc, BACKLIGHT_KEY, smc->backlight_state, 2);
+
 	return 0;
 }
 
 /* Reinitialize device on resume from hibernation */
 static int applesmc_pm_restore(struct device *dev)
 {
-	applesmc_device_init();
+	struct applesmc_device *smc = dev_get_drvdata(dev);
+
+	applesmc_device_init(smc);
+
 	return applesmc_pm_resume(dev);
 }
 
+static const struct acpi_device_id applesmc_ids[] = {
+	{"APP0001", 0},
+	{"", 0},
+};
+
 static const struct dev_pm_ops applesmc_pm_ops = {
 	.resume = applesmc_pm_resume,
 	.restore = applesmc_pm_restore,
 };
 
-static struct platform_driver applesmc_driver = {
-	.probe = applesmc_probe,
-	.driver	= {
-		.name = "applesmc",
-		.pm = &applesmc_pm_ops,
+static struct acpi_driver applesmc_driver = {
+	.name = "applesmc",
+	.class = "applesmc",
+	.ids = applesmc_ids,
+	.ops = {
+		.add = applesmc_add,
+		.remove = applesmc_remove
+	},
+	.drv = {
+		.pm = &applesmc_pm_ops
 	},
 };
 
@@ -714,25 +1149,26 @@ static struct platform_driver applesmc_driver = {
  * applesmc_calibrate - Set our "resting" values.  Callers must
  * hold applesmc_lock.
  */
-static void applesmc_calibrate(void)
+static void applesmc_calibrate(struct applesmc_device *smc)
 {
-	applesmc_read_s16(MOTION_SENSOR_X_KEY, &rest_x);
-	applesmc_read_s16(MOTION_SENSOR_Y_KEY, &rest_y);
-	rest_x = -rest_x;
+	applesmc_read_s16(smc, MOTION_SENSOR_X_KEY, &smc->rest_x);
+	applesmc_read_s16(smc, MOTION_SENSOR_Y_KEY, &smc->rest_y);
+	smc->rest_x = -smc->rest_x;
 }
 
 static void applesmc_idev_poll(struct input_dev *idev)
 {
+	struct applesmc_device *smc = dev_get_drvdata(&idev->dev);
 	s16 x, y;
 
-	if (applesmc_read_s16(MOTION_SENSOR_X_KEY, &x))
+	if (applesmc_read_s16(smc, MOTION_SENSOR_X_KEY, &x))
 		return;
-	if (applesmc_read_s16(MOTION_SENSOR_Y_KEY, &y))
+	if (applesmc_read_s16(smc, MOTION_SENSOR_Y_KEY, &y))
 		return;
 
 	x = -x;
-	input_report_abs(idev, ABS_X, x - rest_x);
-	input_report_abs(idev, ABS_Y, y - rest_y);
+	input_report_abs(idev, ABS_X, x - smc->rest_x);
+	input_report_abs(idev, ABS_Y, y - smc->rest_y);
 	input_sync(idev);
 }
 
@@ -747,16 +1183,17 @@ static ssize_t applesmc_name_show(struct device *dev,
 static ssize_t applesmc_position_show(struct device *dev,
 				   struct device_attribute *attr, char *buf)
 {
+	struct applesmc_device *smc = dev_get_drvdata(dev);
 	int ret;
 	s16 x, y, z;
 
-	ret = applesmc_read_s16(MOTION_SENSOR_X_KEY, &x);
+	ret = applesmc_read_s16(smc, MOTION_SENSOR_X_KEY, &x);
 	if (ret)
 		goto out;
-	ret = applesmc_read_s16(MOTION_SENSOR_Y_KEY, &y);
+	ret = applesmc_read_s16(smc, MOTION_SENSOR_Y_KEY, &y);
 	if (ret)
 		goto out;
-	ret = applesmc_read_s16(MOTION_SENSOR_Z_KEY, &z);
+	ret = applesmc_read_s16(smc, MOTION_SENSOR_Z_KEY, &z);
 	if (ret)
 		goto out;
 
@@ -770,6 +1207,7 @@ static ssize_t applesmc_position_show(struct device *dev,
 static ssize_t applesmc_light_show(struct device *dev,
 				struct device_attribute *attr, char *sysfsbuf)
 {
+	struct applesmc_device *smc = dev_get_drvdata(dev);
 	const struct applesmc_entry *entry;
 	static int data_length;
 	int ret;
@@ -777,7 +1215,7 @@ static ssize_t applesmc_light_show(struct device *dev,
 	u8 buffer[10];
 
 	if (!data_length) {
-		entry = applesmc_get_entry_by_key(LIGHT_SENSOR_LEFT_KEY);
+		entry = applesmc_get_entry_by_key(smc, LIGHT_SENSOR_LEFT_KEY);
 		if (IS_ERR(entry))
 			return PTR_ERR(entry);
 		if (entry->len > 10)
@@ -786,7 +1224,7 @@ static ssize_t applesmc_light_show(struct device *dev,
 		pr_info("light sensor data length set to %d\n", data_length);
 	}
 
-	ret = applesmc_read_key(LIGHT_SENSOR_LEFT_KEY, buffer, data_length);
+	ret = applesmc_read_key(smc, LIGHT_SENSOR_LEFT_KEY, buffer, data_length);
 	if (ret)
 		goto out;
 	/* newer macbooks report a single 10-bit bigendian value */
@@ -796,7 +1234,7 @@ static ssize_t applesmc_light_show(struct device *dev,
 	}
 	left = buffer[2];
 
-	ret = applesmc_read_key(LIGHT_SENSOR_RIGHT_KEY, buffer, data_length);
+	ret = applesmc_read_key(smc, LIGHT_SENSOR_RIGHT_KEY, buffer, data_length);
 	if (ret)
 		goto out;
 	right = buffer[2];
@@ -812,7 +1250,8 @@ static ssize_t applesmc_light_show(struct device *dev,
 static ssize_t applesmc_show_sensor_label(struct device *dev,
 			struct device_attribute *devattr, char *sysfsbuf)
 {
-	const char *key = smcreg.index[to_index(devattr)];
+	struct applesmc_device *smc = dev_get_drvdata(dev);
+	const char *key = smc->reg.index[to_index(devattr)];
 
 	return sysfs_emit(sysfsbuf, "%s\n", key);
 }
@@ -821,12 +1260,13 @@ static ssize_t applesmc_show_sensor_label(struct device *dev,
 static ssize_t applesmc_show_temperature(struct device *dev,
 			struct device_attribute *devattr, char *sysfsbuf)
 {
-	const char *key = smcreg.index[to_index(devattr)];
+	struct applesmc_device *smc = dev_get_drvdata(dev);
+	const char *key = smc->reg.index[to_index(devattr)];
 	int ret;
 	s16 value;
 	int temp;
 
-	ret = applesmc_read_s16(key, &value);
+	ret = applesmc_read_s16(smc, key, &value);
 	if (ret)
 		return ret;
 
@@ -838,6 +1278,8 @@ static ssize_t applesmc_show_temperature(struct device *dev,
 static ssize_t applesmc_show_fan_speed(struct device *dev,
 				struct device_attribute *attr, char *sysfsbuf)
 {
+	struct applesmc_device *smc = dev_get_drvdata(dev);
+	const struct applesmc_entry *entry;
 	int ret;
 	unsigned int speed = 0;
 	char newkey[5];
@@ -846,11 +1288,21 @@ static ssize_t applesmc_show_fan_speed(struct device *dev,
 	scnprintf(newkey, sizeof(newkey), fan_speed_fmt[to_option(attr)],
 		  to_index(attr));
 
-	ret = applesmc_read_key(newkey, buffer, 2);
+	entry = applesmc_get_entry_by_key(smc, newkey);
+	if (IS_ERR(entry))
+		return PTR_ERR(entry);
+
+	if (!strcmp(entry->type, FLOAT_TYPE)) {
+		ret = applesmc_read_entry(smc, entry, (u8 *) &speed, 4);
+		speed = applesmc_float_to_u32(speed);
+	} else {
+		ret = applesmc_read_entry(smc, entry, buffer, 2);
+		speed = ((buffer[0] << 8 | buffer[1]) >> 2);
+	}
+
 	if (ret)
 		return ret;
 
-	speed = ((buffer[0] << 8 | buffer[1]) >> 2);
 	return sysfs_emit(sysfsbuf, "%u\n", speed);
 }
 
@@ -858,6 +1310,8 @@ static ssize_t applesmc_store_fan_speed(struct device *dev,
 					struct device_attribute *attr,
 					const char *sysfsbuf, size_t count)
 {
+	struct applesmc_device *smc = dev_get_drvdata(dev);
+	const struct applesmc_entry *entry;
 	int ret;
 	unsigned long speed;
 	char newkey[5];
@@ -869,9 +1323,18 @@ static ssize_t applesmc_store_fan_speed(struct device *dev,
 	scnprintf(newkey, sizeof(newkey), fan_speed_fmt[to_option(attr)],
 		  to_index(attr));
 
-	buffer[0] = (speed >> 6) & 0xff;
-	buffer[1] = (speed << 2) & 0xff;
-	ret = applesmc_write_key(newkey, buffer, 2);
+	entry = applesmc_get_entry_by_key(smc, newkey);
+	if (IS_ERR(entry))
+		return PTR_ERR(entry);
+
+	if (!strcmp(entry->type, FLOAT_TYPE)) {
+		speed = applesmc_u32_to_float(speed);
+		ret = applesmc_write_entry(smc, entry, (u8 *) &speed, 4);
+	} else {
+		buffer[0] = (speed >> 6) & 0xff;
+		buffer[1] = (speed << 2) & 0xff;
+		ret = applesmc_write_key(smc, newkey, buffer, 2);
+	}
 
 	if (ret)
 		return ret;
@@ -882,15 +1345,30 @@ static ssize_t applesmc_store_fan_speed(struct device *dev,
 static ssize_t applesmc_show_fan_manual(struct device *dev,
 			struct device_attribute *attr, char *sysfsbuf)
 {
+	struct applesmc_device *smc = dev_get_drvdata(dev);
 	int ret;
 	u16 manual = 0;
 	u8 buffer[2];
+	char newkey[5];
+	bool has_newkey = false;
+
+	scnprintf(newkey, sizeof(newkey), FAN_MANUAL_FMT, to_index(attr));
+
+	ret = applesmc_has_key(smc, newkey, &has_newkey);
+	if (ret)
+		return ret;
+
+	if (has_newkey) {
+		ret = applesmc_read_key(smc, newkey, buffer, 1);
+		manual = buffer[0];
+	} else {
+		ret = applesmc_read_key(smc, FANS_MANUAL, buffer, 2);
+		manual = ((buffer[0] << 8 | buffer[1]) >> to_index(attr)) & 0x01;
+	}
 
-	ret = applesmc_read_key(FANS_MANUAL, buffer, 2);
 	if (ret)
 		return ret;
 
-	manual = ((buffer[0] << 8 | buffer[1]) >> to_index(attr)) & 0x01;
 	return sysfs_emit(sysfsbuf, "%d\n", manual);
 }
 
@@ -898,29 +1376,42 @@ static ssize_t applesmc_store_fan_manual(struct device *dev,
 					 struct device_attribute *attr,
 					 const char *sysfsbuf, size_t count)
 {
+	struct applesmc_device *smc = dev_get_drvdata(dev);
 	int ret;
 	u8 buffer[2];
+	char newkey[5];
+	bool has_newkey = false;
 	unsigned long input;
 	u16 val;
 
 	if (kstrtoul(sysfsbuf, 10, &input) < 0)
 		return -EINVAL;
 
-	ret = applesmc_read_key(FANS_MANUAL, buffer, 2);
+	scnprintf(newkey, sizeof(newkey), FAN_MANUAL_FMT, to_index(attr));
+
+	ret = applesmc_has_key(smc, newkey, &has_newkey);
 	if (ret)
-		goto out;
+		return ret;
 
-	val = (buffer[0] << 8 | buffer[1]);
+	if (has_newkey) {
+		buffer[0] = input & 1;
+		ret = applesmc_write_key(smc, newkey, buffer, 1);
+	} else {
+		ret = applesmc_read_key(smc, FANS_MANUAL, buffer, 2);
+		val = (buffer[0] << 8 | buffer[1]);
+		if (ret)
+			goto out;
 
-	if (input)
-		val = val | (0x01 << to_index(attr));
-	else
-		val = val & ~(0x01 << to_index(attr));
+		if (input)
+			val = val | (0x01 << to_index(attr));
+		else
+			val = val & ~(0x01 << to_index(attr));
 
-	buffer[0] = (val >> 8) & 0xFF;
-	buffer[1] = val & 0xFF;
+		buffer[0] = (val >> 8) & 0xFF;
+		buffer[1] = val & 0xFF;
 
-	ret = applesmc_write_key(FANS_MANUAL, buffer, 2);
+		ret = applesmc_write_key(smc, FANS_MANUAL, buffer, 2);
+	}
 
 out:
 	if (ret)
@@ -932,13 +1423,14 @@ static ssize_t applesmc_store_fan_manual(struct device *dev,
 static ssize_t applesmc_show_fan_position(struct device *dev,
 				struct device_attribute *attr, char *sysfsbuf)
 {
+	struct applesmc_device *smc = dev_get_drvdata(dev);
 	int ret;
 	char newkey[5];
 	u8 buffer[17];
 
 	scnprintf(newkey, sizeof(newkey), FAN_ID_FMT, to_index(attr));
 
-	ret = applesmc_read_key(newkey, buffer, 16);
+	ret = applesmc_read_key(smc, newkey, buffer, 16);
 	buffer[16] = 0;
 
 	if (ret)
@@ -950,43 +1442,79 @@ static ssize_t applesmc_show_fan_position(struct device *dev,
 static ssize_t applesmc_calibrate_show(struct device *dev,
 				struct device_attribute *attr, char *sysfsbuf)
 {
-	return sysfs_emit(sysfsbuf, "(%d,%d)\n", rest_x, rest_y);
+	struct applesmc_device *smc = dev_get_drvdata(dev);
+
+	return sysfs_emit(sysfsbuf, "(%d,%d)\n", smc->rest_x, smc->rest_y);
 }
 
 static ssize_t applesmc_calibrate_store(struct device *dev,
 	struct device_attribute *attr, const char *sysfsbuf, size_t count)
 {
-	applesmc_calibrate();
+	struct applesmc_device *smc = dev_get_drvdata(dev);
+
+	applesmc_calibrate(smc);
 
 	return count;
 }
 
 static void applesmc_backlight_set(struct work_struct *work)
 {
-	applesmc_write_key(BACKLIGHT_KEY, backlight_state, 2);
+	struct applesmc_device *smc = container_of(work, struct applesmc_device, backlight_work);
+
+	applesmc_write_key(smc, BACKLIGHT_KEY, smc->backlight_state, 2);
 }
-static DECLARE_WORK(backlight_work, &applesmc_backlight_set);
 
 static void applesmc_brightness_set(struct led_classdev *led_cdev,
 						enum led_brightness value)
 {
+	struct applesmc_device *smc = dev_get_drvdata(led_cdev->dev);
 	int ret;
 
-	backlight_state[0] = value;
-	ret = queue_work(applesmc_led_wq, &backlight_work);
+	smc->backlight_state[0] = value;
+	ret = queue_work(smc->backlight_wq, &smc->backlight_work);
 
 	if (debug && (!ret))
 		dev_dbg(led_cdev->dev, "work was already on the queue.\n");
 }
 
+static ssize_t applesmc_BCLM_store(struct device *dev,
+		struct device_attribute *attr, char *sysfsbuf, size_t count)
+{
+	struct applesmc_device *smc = dev_get_drvdata(dev);
+	u8 val;
+
+	if (kstrtou8(sysfsbuf, 10, &val) < 0)
+		return -EINVAL;
+
+	if (val < 0 || val > 100)
+		return -EINVAL;
+
+	if (applesmc_write_key(smc, "BCLM", &val, 1))
+		return -ENODEV;
+	return count;
+}
+
+static ssize_t applesmc_BCLM_show(struct device *dev,
+		struct device_attribute *attr, char *sysfsbuf)
+{
+	struct applesmc_device *smc = dev_get_drvdata(dev);
+	u8 val;
+
+	if (applesmc_read_key(smc, "BCLM", &val, 1))
+		return -ENODEV;
+
+	return sysfs_emit(sysfsbuf, "%d\n", val);
+}
+
 static ssize_t applesmc_key_count_show(struct device *dev,
 				struct device_attribute *attr, char *sysfsbuf)
 {
+	struct applesmc_device *smc = dev_get_drvdata(dev);
 	int ret;
 	u8 buffer[4];
 	u32 count;
 
-	ret = applesmc_read_key(KEY_COUNT_KEY, buffer, 4);
+	ret = applesmc_read_key(smc, KEY_COUNT_KEY, buffer, 4);
 	if (ret)
 		return ret;
 
@@ -998,13 +1526,14 @@ static ssize_t applesmc_key_count_show(struct device *dev,
 static ssize_t applesmc_key_at_index_read_show(struct device *dev,
 				struct device_attribute *attr, char *sysfsbuf)
 {
+	struct applesmc_device *smc = dev_get_drvdata(dev);
 	const struct applesmc_entry *entry;
 	int ret;
 
-	entry = applesmc_get_entry_by_index(key_at_index);
+	entry = applesmc_get_entry_by_index(smc, smc->key_at_index);
 	if (IS_ERR(entry))
 		return PTR_ERR(entry);
-	ret = applesmc_read_entry(entry, sysfsbuf, entry->len);
+	ret = applesmc_read_entry(smc, entry, sysfsbuf, entry->len);
 	if (ret)
 		return ret;
 
@@ -1014,9 +1543,10 @@ static ssize_t applesmc_key_at_index_read_show(struct device *dev,
 static ssize_t applesmc_key_at_index_data_length_show(struct device *dev,
 				struct device_attribute *attr, char *sysfsbuf)
 {
+	struct applesmc_device *smc = dev_get_drvdata(dev);
 	const struct applesmc_entry *entry;
 
-	entry = applesmc_get_entry_by_index(key_at_index);
+	entry = applesmc_get_entry_by_index(smc, smc->key_at_index);
 	if (IS_ERR(entry))
 		return PTR_ERR(entry);
 
@@ -1026,9 +1556,10 @@ static ssize_t applesmc_key_at_index_data_length_show(struct device *dev,
 static ssize_t applesmc_key_at_index_type_show(struct device *dev,
 				struct device_attribute *attr, char *sysfsbuf)
 {
+	struct applesmc_device *smc = dev_get_drvdata(dev);
 	const struct applesmc_entry *entry;
 
-	entry = applesmc_get_entry_by_index(key_at_index);
+	entry = applesmc_get_entry_by_index(smc, smc->key_at_index);
 	if (IS_ERR(entry))
 		return PTR_ERR(entry);
 
@@ -1038,9 +1569,10 @@ static ssize_t applesmc_key_at_index_type_show(struct device *dev,
 static ssize_t applesmc_key_at_index_name_show(struct device *dev,
 				struct device_attribute *attr, char *sysfsbuf)
 {
+	struct applesmc_device *smc = dev_get_drvdata(dev);
 	const struct applesmc_entry *entry;
 
-	entry = applesmc_get_entry_by_index(key_at_index);
+	entry = applesmc_get_entry_by_index(smc, smc->key_at_index);
 	if (IS_ERR(entry))
 		return PTR_ERR(entry);
 
@@ -1050,28 +1582,25 @@ static ssize_t applesmc_key_at_index_name_show(struct device *dev,
 static ssize_t applesmc_key_at_index_show(struct device *dev,
 				struct device_attribute *attr, char *sysfsbuf)
 {
-	return sysfs_emit(sysfsbuf, "%d\n", key_at_index);
+	struct applesmc_device *smc = dev_get_drvdata(dev);
+
+	return sysfs_emit(sysfsbuf, "%d\n", smc->key_at_index);
 }
 
 static ssize_t applesmc_key_at_index_store(struct device *dev,
 	struct device_attribute *attr, const char *sysfsbuf, size_t count)
 {
+	struct applesmc_device *smc = dev_get_drvdata(dev);
 	unsigned long newkey;
 
 	if (kstrtoul(sysfsbuf, 10, &newkey) < 0
-	    || newkey >= smcreg.key_count)
+	    || newkey >= smc->reg.key_count)
 		return -EINVAL;
 
-	key_at_index = newkey;
+	smc->key_at_index = newkey;
 	return count;
 }
 
-static struct led_classdev applesmc_backlight = {
-	.name			= "smc::kbd_backlight",
-	.default_trigger	= "nand-disk",
-	.brightness_set		= applesmc_brightness_set,
-};
-
 static struct applesmc_node_group info_group[] = {
 	{ "name", applesmc_name_show },
 	{ "key_count", applesmc_key_count_show },
@@ -1111,19 +1640,25 @@ static struct applesmc_node_group temp_group[] = {
 	{ }
 };
 
+static struct applesmc_node_group BCLM_group[] = {
+	{ "battery_charge_limit", applesmc_BCLM_show, applesmc_BCLM_store },
+	{ }
+};
+
 /* Module stuff */
 
 /*
  * applesmc_destroy_nodes - remove files and free associated memory
  */
-static void applesmc_destroy_nodes(struct applesmc_node_group *groups)
+static void applesmc_destroy_nodes(struct applesmc_device *smc,
+	struct applesmc_node_group *groups)
 {
 	struct applesmc_node_group *grp;
 	struct applesmc_dev_attr *node;
 
 	for (grp = groups; grp->nodes; grp++) {
 		for (node = grp->nodes; node->sda.dev_attr.attr.name; node++)
-			sysfs_remove_file(&pdev->dev.kobj,
+			sysfs_remove_file(&smc->dev->dev.kobj,
 					  &node->sda.dev_attr.attr);
 		kfree(grp->nodes);
 		grp->nodes = NULL;
@@ -1133,7 +1668,8 @@ static void applesmc_destroy_nodes(struct applesmc_node_group *groups)
 /*
  * applesmc_create_nodes - create a two-dimensional group of sysfs files
  */
-static int applesmc_create_nodes(struct applesmc_node_group *groups, int num)
+static int applesmc_create_nodes(struct applesmc_device *smc,
+	struct applesmc_node_group *groups, int num)
 {
 	struct applesmc_node_group *grp;
 	struct applesmc_dev_attr *node;
@@ -1157,7 +1693,7 @@ static int applesmc_create_nodes(struct applesmc_node_group *groups, int num)
 			sysfs_attr_init(attr);
 			attr->name = node->name;
 			attr->mode = 0444 | (grp->store ? 0200 : 0);
-			ret = sysfs_create_file(&pdev->dev.kobj, attr);
+			ret = sysfs_create_file(&smc->dev->dev.kobj, attr);
 			if (ret) {
 				attr->name = NULL;
 				goto out;
@@ -1167,57 +1703,56 @@ static int applesmc_create_nodes(struct applesmc_node_group *groups, int num)
 
 	return 0;
 out:
-	applesmc_destroy_nodes(groups);
+	applesmc_destroy_nodes(smc, groups);
 	return ret;
 }
 
 /* Create accelerometer resources */
-static int applesmc_create_accelerometer(void)
+static int applesmc_create_accelerometer(struct applesmc_device *smc)
 {
 	int ret;
-
-	if (!smcreg.has_accelerometer)
+	if (!smc->reg.has_accelerometer)
 		return 0;
 
-	ret = applesmc_create_nodes(accelerometer_group, 1);
+	ret = applesmc_create_nodes(smc, accelerometer_group, 1);
 	if (ret)
 		goto out;
 
-	applesmc_idev = input_allocate_device();
-	if (!applesmc_idev) {
+	smc->idev = input_allocate_device();
+	if (!smc->idev) {
 		ret = -ENOMEM;
 		goto out_sysfs;
 	}
 
 	/* initial calibrate for the input device */
-	applesmc_calibrate();
+	applesmc_calibrate(smc);
 
 	/* initialize the input device */
-	applesmc_idev->name = "applesmc";
-	applesmc_idev->id.bustype = BUS_HOST;
-	applesmc_idev->dev.parent = &pdev->dev;
-	input_set_abs_params(applesmc_idev, ABS_X,
+	smc->idev->name = "applesmc";
+	smc->idev->id.bustype = BUS_HOST;
+	smc->idev->dev.parent = &smc->dev->dev;
+	input_set_abs_params(smc->idev, ABS_X,
 			-256, 256, APPLESMC_INPUT_FUZZ, APPLESMC_INPUT_FLAT);
-	input_set_abs_params(applesmc_idev, ABS_Y,
+	input_set_abs_params(smc->idev, ABS_Y,
 			-256, 256, APPLESMC_INPUT_FUZZ, APPLESMC_INPUT_FLAT);
 
-	ret = input_setup_polling(applesmc_idev, applesmc_idev_poll);
+	ret = input_setup_polling(smc->idev, applesmc_idev_poll);
 	if (ret)
 		goto out_idev;
 
-	input_set_poll_interval(applesmc_idev, APPLESMC_POLL_INTERVAL);
+	input_set_poll_interval(smc->idev, APPLESMC_POLL_INTERVAL);
 
-	ret = input_register_device(applesmc_idev);
+	ret = input_register_device(smc->idev);
 	if (ret)
 		goto out_idev;
 
 	return 0;
 
 out_idev:
-	input_free_device(applesmc_idev);
+	input_free_device(smc->idev);
 
 out_sysfs:
-	applesmc_destroy_nodes(accelerometer_group);
+	applesmc_destroy_nodes(smc, accelerometer_group);
 
 out:
 	pr_warn("driver init failed (ret=%d)!\n", ret);
@@ -1225,44 +1760,55 @@ static int applesmc_create_accelerometer(void)
 }
 
 /* Release all resources used by the accelerometer */
-static void applesmc_release_accelerometer(void)
+static void applesmc_release_accelerometer(struct applesmc_device *smc)
 {
-	if (!smcreg.has_accelerometer)
+	if (!smc->reg.has_accelerometer)
 		return;
-	input_unregister_device(applesmc_idev);
-	applesmc_destroy_nodes(accelerometer_group);
+	input_unregister_device(smc->idev);
+	applesmc_destroy_nodes(smc, accelerometer_group);
 }
 
-static int applesmc_create_light_sensor(void)
+static int applesmc_create_light_sensor(struct applesmc_device *smc)
 {
-	if (!smcreg.num_light_sensors)
+	if (!smc->reg.num_light_sensors)
 		return 0;
-	return applesmc_create_nodes(light_sensor_group, 1);
+	return applesmc_create_nodes(smc, light_sensor_group, 1);
 }
 
-static void applesmc_release_light_sensor(void)
+static void applesmc_release_light_sensor(struct applesmc_device *smc)
 {
-	if (!smcreg.num_light_sensors)
+	if (!smc->reg.num_light_sensors)
 		return;
-	applesmc_destroy_nodes(light_sensor_group);
+	applesmc_destroy_nodes(smc, light_sensor_group);
 }
 
-static int applesmc_create_key_backlight(void)
+static int applesmc_create_key_backlight(struct applesmc_device *smc)
 {
-	if (!smcreg.has_key_backlight)
+	int ret;
+
+	if (!smc->reg.has_key_backlight)
 		return 0;
-	applesmc_led_wq = create_singlethread_workqueue("applesmc-led");
-	if (!applesmc_led_wq)
+	smc->backlight_wq = create_singlethread_workqueue("applesmc-led");
+	if (!smc->backlight_wq)
 		return -ENOMEM;
-	return led_classdev_register(&pdev->dev, &applesmc_backlight);
+
+	INIT_WORK(&smc->backlight_work, applesmc_backlight_set);
+	smc->backlight_dev.name = "smc::kbd_backlight";
+	smc->backlight_dev.default_trigger = "nand-disk";
+	smc->backlight_dev.brightness_set = applesmc_brightness_set;
+	ret = led_classdev_register(&smc->dev->dev, &smc->backlight_dev);
+	if (ret)
+		destroy_workqueue(smc->backlight_wq);
+
+	return ret;
 }
 
-static void applesmc_release_key_backlight(void)
+static void applesmc_release_key_backlight(struct applesmc_device *smc)
 {
-	if (!smcreg.has_key_backlight)
+	if (!smc->reg.has_key_backlight)
 		return;
-	led_classdev_unregister(&applesmc_backlight);
-	destroy_workqueue(applesmc_led_wq);
+	led_classdev_unregister(&smc->backlight_dev);
+	destroy_workqueue(smc->backlight_wq);
 }
 
 static int applesmc_dmi_match(const struct dmi_system_id *id)
@@ -1291,6 +1837,10 @@ static const struct dmi_system_id applesmc_whitelist[] __initconst = {
 	  DMI_MATCH(DMI_BOARD_VENDOR, "Apple"),
 	  DMI_MATCH(DMI_PRODUCT_NAME, "Macmini") },
 	},
+	{ applesmc_dmi_match, "Apple iMacPro", {
+	  DMI_MATCH(DMI_BOARD_VENDOR, "Apple"),
+	  DMI_MATCH(DMI_PRODUCT_NAME, "iMacPro") },
+	},
 	{ applesmc_dmi_match, "Apple MacPro", {
 	  DMI_MATCH(DMI_BOARD_VENDOR, "Apple"),
 	  DMI_MATCH(DMI_PRODUCT_NAME, "MacPro") },
@@ -1306,90 +1856,91 @@ static const struct dmi_system_id applesmc_whitelist[] __initconst = {
 	{ .ident = NULL }
 };
 
-static int __init applesmc_init(void)
+static int applesmc_create_modules(struct applesmc_device *smc)
 {
 	int ret;
 
-	if (!dmi_check_system(applesmc_whitelist)) {
-		pr_warn("supported laptop not found!\n");
-		ret = -ENODEV;
-		goto out;
-	}
-
-	if (!request_region(APPLESMC_DATA_PORT, APPLESMC_NR_PORTS,
-								"applesmc")) {
-		ret = -ENXIO;
-		goto out;
-	}
-
-	ret = platform_driver_register(&applesmc_driver);
-	if (ret)
-		goto out_region;
-
-	pdev = platform_device_register_simple("applesmc", APPLESMC_DATA_PORT,
-					       NULL, 0);
-	if (IS_ERR(pdev)) {
-		ret = PTR_ERR(pdev);
-		goto out_driver;
-	}
-
-	/* create register cache */
-	ret = applesmc_init_smcreg();
+	ret = applesmc_create_nodes(smc, info_group, 1);
 	if (ret)
-		goto out_device;
-
-	ret = applesmc_create_nodes(info_group, 1);
+		goto out;
+	ret = applesmc_create_nodes(smc, BCLM_group, 1);
 	if (ret)
-		goto out_smcreg;
+		goto out_info;
 
-	ret = applesmc_create_nodes(fan_group, smcreg.fan_count);
+	ret = applesmc_create_nodes(smc, fan_group, smc->reg.fan_count);
 	if (ret)
-		goto out_info;
+		goto out_bclm;
 
-	ret = applesmc_create_nodes(temp_group, smcreg.index_count);
+	ret = applesmc_create_nodes(smc, temp_group, smc->reg.index_count);
 	if (ret)
 		goto out_fans;
 
-	ret = applesmc_create_accelerometer();
+	ret = applesmc_create_accelerometer(smc);
 	if (ret)
 		goto out_temperature;
 
-	ret = applesmc_create_light_sensor();
+	ret = applesmc_create_light_sensor(smc);
 	if (ret)
 		goto out_accelerometer;
 
-	ret = applesmc_create_key_backlight();
+	ret = applesmc_create_key_backlight(smc);
 	if (ret)
 		goto out_light_sysfs;
 
-	hwmon_dev = hwmon_device_register(&pdev->dev);
-	if (IS_ERR(hwmon_dev)) {
-		ret = PTR_ERR(hwmon_dev);
+	smc->hwmon_dev = hwmon_device_register(&smc->dev->dev);
+	if (IS_ERR(smc->hwmon_dev)) {
+		ret = PTR_ERR(smc->hwmon_dev);
 		goto out_light_ledclass;
 	}
 
 	return 0;
 
 out_light_ledclass:
-	applesmc_release_key_backlight();
+	applesmc_release_key_backlight(smc);
 out_light_sysfs:
-	applesmc_release_light_sensor();
+	applesmc_release_light_sensor(smc);
 out_accelerometer:
-	applesmc_release_accelerometer();
+	applesmc_release_accelerometer(smc);
 out_temperature:
-	applesmc_destroy_nodes(temp_group);
+	applesmc_destroy_nodes(smc, temp_group);
 out_fans:
-	applesmc_destroy_nodes(fan_group);
+	applesmc_destroy_nodes(smc, fan_group);
+out_bclm:
+	applesmc_destroy_nodes(smc, BCLM_group);
 out_info:
-	applesmc_destroy_nodes(info_group);
-out_smcreg:
-	applesmc_destroy_smcreg();
-out_device:
-	platform_device_unregister(pdev);
-out_driver:
-	platform_driver_unregister(&applesmc_driver);
-out_region:
-	release_region(APPLESMC_DATA_PORT, APPLESMC_NR_PORTS);
+	applesmc_destroy_nodes(smc, info_group);
+out:
+	return ret;
+}
+
+static void applesmc_destroy_modules(struct applesmc_device *smc)
+{
+	hwmon_device_unregister(smc->hwmon_dev);
+	applesmc_release_key_backlight(smc);
+	applesmc_release_light_sensor(smc);
+	applesmc_release_accelerometer(smc);
+	applesmc_destroy_nodes(smc, temp_group);
+	applesmc_destroy_nodes(smc, fan_group);
+	applesmc_destroy_nodes(smc, BCLM_group);
+	applesmc_destroy_nodes(smc, info_group);
+}
+
+static int __init applesmc_init(void)
+{
+	int ret;
+
+	if (!dmi_check_system(applesmc_whitelist)) {
+		pr_warn("supported laptop not found!\n");
+		ret = -ENODEV;
+		goto out;
+	}
+
+	ret = acpi_bus_register_driver(&applesmc_driver);
+	if (ret)
+		goto out;
+
+	return 0;
+
 out:
 	pr_warn("driver init failed (ret=%d)!\n", ret);
 	return ret;
@@ -1397,23 +1948,14 @@ static int __init applesmc_init(void)
 
 static void __exit applesmc_exit(void)
 {
-	hwmon_device_unregister(hwmon_dev);
-	applesmc_release_key_backlight();
-	applesmc_release_light_sensor();
-	applesmc_release_accelerometer();
-	applesmc_destroy_nodes(temp_group);
-	applesmc_destroy_nodes(fan_group);
-	applesmc_destroy_nodes(info_group);
-	applesmc_destroy_smcreg();
-	platform_device_unregister(pdev);
-	platform_driver_unregister(&applesmc_driver);
-	release_region(APPLESMC_DATA_PORT, APPLESMC_NR_PORTS);
+	acpi_bus_unregister_driver(&applesmc_driver);
 }
 
 module_init(applesmc_init);
 module_exit(applesmc_exit);
 
 MODULE_AUTHOR("Nicolas Boichat");
+MODULE_AUTHOR("Paul Pawlowski");
 MODULE_DESCRIPTION("Apple SMC");
 MODULE_LICENSE("GPL v2");
 MODULE_DEVICE_TABLE(dmi, applesmc_whitelist);
diff --git a/drivers/input/mouse/bcm5974.c b/drivers/input/mouse/bcm5974.c
index ca150618d32f..4e692b272ae9 100644
--- a/drivers/input/mouse/bcm5974.c
+++ b/drivers/input/mouse/bcm5974.c
@@ -83,6 +83,24 @@
 #define USB_DEVICE_ID_APPLE_WELLSPRING9_ISO	0x0273
 #define USB_DEVICE_ID_APPLE_WELLSPRING9_JIS	0x0274
 
+/* T2-Attached Devices */
+/* MacbookAir8,1 (2018) */
+#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J140K	0x027a
+/* MacbookPro15,2 (2018) */
+#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J132	0x027b
+/* MacbookPro15,1 (2018) */
+#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J680	0x027c
+/* MacbookPro15,4 (2019) */
+#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J213	0x027d
+/* MacbookPro16,2 (2020) */
+#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J214K	0x027e
+/* MacbookPro16,3 (2020) */
+#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J223	0x027f
+/* MacbookAir9,1 (2020) */
+#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J230K	0x0280
+/* MacbookPro16,1 (2019)*/
+#define USB_DEVICE_ID_APPLE_WELLSPRINGT2_J152F	0x0340
+
 #define BCM5974_DEVICE(prod) {					\
 	.match_flags = (USB_DEVICE_ID_MATCH_DEVICE |		\
 			USB_DEVICE_ID_MATCH_INT_CLASS |		\
@@ -147,6 +165,22 @@ static const struct usb_device_id bcm5974_table[] = {
 	BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING9_ANSI),
 	BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING9_ISO),
 	BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING9_JIS),
+	/* MacbookAir8,1 */
+	BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRINGT2_J140K),
+	/* MacbookPro15,2 */
+	BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRINGT2_J132),
+	/* MacbookPro15,1 */
+	BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRINGT2_J680),
+	/* MacbookPro15,4 */
+	BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRINGT2_J213),
+	/* MacbookPro16,2 */
+	BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRINGT2_J214K),
+	/* MacbookPro16,3 */
+	BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRINGT2_J223),
+	/* MacbookAir9,1 */
+	BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRINGT2_J230K),
+	/* MacbookPro16,1 */
+	BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRINGT2_J152F),
 	/* Terminating entry */
 	{}
 };
@@ -483,6 +517,110 @@ static const struct bcm5974_config bcm5974_config_table[] = {
 		{ SN_COORD, -203, 6803 },
 		{ SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION }
 	},
+	{
+		USB_DEVICE_ID_APPLE_WELLSPRINGT2_J140K,
+		0,
+		0,
+		HAS_INTEGRATED_BUTTON,
+		0, sizeof(struct bt_data),
+		0x83, DATAFORMAT(TYPE4),
+		{ SN_PRESSURE, 0, 300 },
+		{ SN_WIDTH, 0, 2048 },
+		{ SN_COORD, -6243, 6749 },
+		{ SN_COORD, -170, 7685 },
+		{ SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION }
+	},
+	{
+		USB_DEVICE_ID_APPLE_WELLSPRINGT2_J132,
+		0,
+		0,
+		HAS_INTEGRATED_BUTTON,
+		0, sizeof(struct bt_data),
+		0x83, DATAFORMAT(TYPE4),
+		{ SN_PRESSURE, 0, 300 },
+		{ SN_WIDTH, 0, 2048 },
+		{ SN_COORD, -6243, 6749 },
+		{ SN_COORD, -170, 7685 },
+		{ SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION }
+	},
+	{
+		USB_DEVICE_ID_APPLE_WELLSPRINGT2_J680,
+		0,
+		0,
+		HAS_INTEGRATED_BUTTON,
+		0, sizeof(struct bt_data),
+		0x83, DATAFORMAT(TYPE4),
+		{ SN_PRESSURE, 0, 300 },
+		{ SN_WIDTH, 0, 2048 },
+		{ SN_COORD, -7456, 7976 },
+		{ SN_COORD, -1768, 7685 },
+		{ SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION }
+	},
+	{
+		USB_DEVICE_ID_APPLE_WELLSPRINGT2_J213,
+		0,
+		0,
+		HAS_INTEGRATED_BUTTON,
+		0, sizeof(struct bt_data),
+		0x83, DATAFORMAT(TYPE4),
+		{ SN_PRESSURE, 0, 300 },
+		{ SN_WIDTH, 0, 2048 },
+		{ SN_COORD, -6243, 6749 },
+		{ SN_COORD, -170, 7685 },
+		{ SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION }
+	},
+	{
+		USB_DEVICE_ID_APPLE_WELLSPRINGT2_J214K,
+		0,
+		0,
+		HAS_INTEGRATED_BUTTON,
+		0, sizeof(struct bt_data),
+		0x83, DATAFORMAT(TYPE4),
+		{ SN_PRESSURE, 0, 300 },
+		{ SN_WIDTH, 0, 2048 },
+		{ SN_COORD, -7823, 8329 },
+		{ SN_COORD, -370, 7925 },
+		{ SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION }
+	},
+	{
+		USB_DEVICE_ID_APPLE_WELLSPRINGT2_J223,
+		0,
+		0,
+		HAS_INTEGRATED_BUTTON,
+		0, sizeof(struct bt_data),
+		0x83, DATAFORMAT(TYPE4),
+		{ SN_PRESSURE, 0, 300 },
+		{ SN_WIDTH, 0, 2048 },
+		{ SN_COORD, -6243, 6749 },
+		{ SN_COORD, -170, 7685 },
+		{ SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION }
+	},
+	{
+		USB_DEVICE_ID_APPLE_WELLSPRINGT2_J230K,
+		0,
+		0,
+		HAS_INTEGRATED_BUTTON,
+		0, sizeof(struct bt_data),
+		0x83, DATAFORMAT(TYPE4),
+		{ SN_PRESSURE, 0, 300 },
+		{ SN_WIDTH, 0, 2048 },
+		{ SN_COORD, -6243, 6749 },
+		{ SN_COORD, -170, 7685 },
+		{ SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION }
+	},
+	{
+		USB_DEVICE_ID_APPLE_WELLSPRINGT2_J152F,
+		0,
+		0,
+		HAS_INTEGRATED_BUTTON,
+		0, sizeof(struct bt_data),
+		0x83, DATAFORMAT(TYPE4),
+		{ SN_PRESSURE, 0, 300 },
+		{ SN_WIDTH, 0, 2048 },
+		{ SN_COORD, -8916, 9918 },
+		{ SN_COORD, -1934, 9835 },
+		{ SN_ORIENT, -MAX_FINGER_ORIENTATION, MAX_FINGER_ORIENTATION }
+	},
 	{}
 };
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 826b768196e2..5fe0e671ecb3 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -4320,16 +4320,9 @@ brcmf_pmksa_v3_op(struct brcmf_if *ifp, struct cfg80211_pmksa *pmksa,
 		/* Single PMK operation */
 		pmk_op->count = cpu_to_le16(1);
 		length += sizeof(struct brcmf_pmksa_v3);
-		if (pmksa->bssid)
-			memcpy(pmk_op->pmk[0].bssid, pmksa->bssid, ETH_ALEN);
-		if (pmksa->pmkid) {
-			memcpy(pmk_op->pmk[0].pmkid, pmksa->pmkid, WLAN_PMKID_LEN);
-			pmk_op->pmk[0].pmkid_len = WLAN_PMKID_LEN;
-		}
-		if (pmksa->ssid && pmksa->ssid_len) {
-			memcpy(pmk_op->pmk[0].ssid.SSID, pmksa->ssid, pmksa->ssid_len);
-			pmk_op->pmk[0].ssid.SSID_len = pmksa->ssid_len;
-		}
+		memcpy(pmk_op->pmk[0].bssid, pmksa->bssid, ETH_ALEN);
+		memcpy(pmk_op->pmk[0].pmkid, pmksa->pmkid, WLAN_PMKID_LEN);
+		pmk_op->pmk[0].pmkid_len = WLAN_PMKID_LEN;
 		pmk_op->pmk[0].time_left = cpu_to_le32(alive ? BRCMF_PMKSA_NO_EXPIRY : 0);
 	}
 
diff --git a/drivers/pci/vgaarb.c b/drivers/pci/vgaarb.c
index 78748e8d2dba..2b2b558cebe6 100644
--- a/drivers/pci/vgaarb.c
+++ b/drivers/pci/vgaarb.c
@@ -143,6 +143,7 @@ void vga_set_default_device(struct pci_dev *pdev)
 	pci_dev_put(vga_default);
 	vga_default = pci_dev_get(pdev);
 }
+EXPORT_SYMBOL_GPL(vga_set_default_device);
 
 /**
  * vga_remove_vgacon - deactivate VGA console
diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c
index 1417e230edbd..e69785af8e1d 100644
--- a/drivers/platform/x86/apple-gmux.c
+++ b/drivers/platform/x86/apple-gmux.c
@@ -21,6 +21,7 @@
 #include <linux/delay.h>
 #include <linux/pci.h>
 #include <linux/vga_switcheroo.h>
+#include <linux/vgaarb.h>
 #include <linux/debugfs.h>
 #include <acpi/video.h>
 #include <asm/io.h>
@@ -107,6 +108,10 @@ struct apple_gmux_config {
 
 # define MMIO_GMUX_MAX_BRIGHTNESS	0xffff
 
+static bool force_igd;
+module_param(force_igd, bool, 0);
+MODULE_PARM_DESC(force_idg, "Switch gpu to igd on module load. Make sure that you have apple-set-os set up and the iGPU is in `lspci -s 00:02.0`. (default: false) (bool)");
+
 static u8 gmux_pio_read8(struct apple_gmux_data *gmux_data, int port)
 {
 	return inb(gmux_data->iostart + port);
@@ -945,6 +950,19 @@ static int gmux_probe(struct pnp_dev *pnp, const struct pnp_device_id *id)
 	gmux_enable_interrupts(gmux_data);
 	gmux_read_switch_state(gmux_data);
 
+	if (force_igd) {
+		struct pci_dev *pdev;
+
+		pdev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(2, 0));
+		if (pdev) {
+			pr_info("Switching to IGD");
+			gmux_switchto(VGA_SWITCHEROO_IGD);
+			vga_set_default_device(pdev);
+		} else {
+			pr_err("force_idg is true, but couldn't find iGPU at 00:02.0! Is apple-set-os working?");
+		}
+	}
+
 	/*
 	 * Retina MacBook Pros cannot switch the panel's AUX separately
 	 * and need eDP pre-calibration. They are distinguishable from
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index db4a392841b1..580df4ce4f9f 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -66,4 +66,6 @@ source "drivers/staging/fieldbus/Kconfig"
 
 source "drivers/staging/vme_user/Kconfig"
 
+source "drivers/staging/apple-bce/Kconfig"
+
 endif # STAGING
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index 5390879b5d1b..528be2d3b546 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -22,3 +22,4 @@ obj-$(CONFIG_GREYBUS)		+= greybus/
 obj-$(CONFIG_BCM2835_VCHIQ)	+= vc04_services/
 obj-$(CONFIG_XIL_AXIS_FIFO)	+= axis-fifo/
 obj-$(CONFIG_FIELDBUS_DEV)     += fieldbus/
+obj-$(CONFIG_APPLE_BCE)		+= apple-bce/
diff --git a/drivers/staging/apple-bce/Kconfig b/drivers/staging/apple-bce/Kconfig
new file mode 100644
index 000000000000..fe92bc441e89
--- /dev/null
+++ b/drivers/staging/apple-bce/Kconfig
@@ -0,0 +1,18 @@
+config APPLE_BCE
+	tristate "Apple BCE driver (VHCI and Audio support)"
+	default m
+	depends on X86
+	select SOUND
+	select SND
+	select SND_PCM
+	select SND_JACK
+	help
+	  VHCI and audio support on Apple MacBooks with the T2 Chip.
+	  This driver is divided in three components:
+	    - BCE (Buffer Copy Engine): which establishes a basic communication
+	      channel with the T2 chip. This component is required by the other two:
+	      - VHCI (Virtual Host Controller Interface): Access to keyboard, mouse
+	        and other system devices depend on this virtual USB host controller
+	      - Audio: a driver for the T2 audio interface.
+	 
+	  If "M" is selected, the module will be called apple-bce.'
diff --git a/drivers/staging/apple-bce/Makefile b/drivers/staging/apple-bce/Makefile
new file mode 100644
index 000000000000..8cfbd3f64af6
--- /dev/null
+++ b/drivers/staging/apple-bce/Makefile
@@ -0,0 +1,28 @@
+modname := apple-bce
+obj-$(CONFIG_APPLE_BCE) += $(modname).o
+
+apple-bce-objs := apple_bce.o mailbox.o queue.o queue_dma.o vhci/vhci.o vhci/queue.o vhci/transfer.o audio/audio.o audio/protocol.o audio/protocol_bce.o audio/pcm.o
+
+MY_CFLAGS += -DWITHOUT_NVME_PATCH
+#MY_CFLAGS += -g -DDEBUG
+ccflags-y += ${MY_CFLAGS}
+CC += ${MY_CFLAGS}
+
+KVERSION := $(KERNELRELEASE)
+ifeq ($(origin KERNELRELEASE), undefined)
+KVERSION := $(shell uname -r)
+endif
+
+KDIR := /lib/modules/$(KVERSION)/build
+PWD := $(shell pwd)
+
+.PHONY: all
+
+all:
+	$(MAKE) -C $(KDIR) M=$(PWD) modules
+
+clean:
+	$(MAKE) -C $(KDIR) M=$(PWD) clean
+
+install:
+	$(MAKE) -C $(KDIR) M=$(PWD) modules_install
diff --git a/drivers/staging/apple-bce/apple_bce.c b/drivers/staging/apple-bce/apple_bce.c
new file mode 100644
index 000000000000..4fd2415d7028
--- /dev/null
+++ b/drivers/staging/apple-bce/apple_bce.c
@@ -0,0 +1,445 @@
+#include "apple_bce.h"
+#include <linux/module.h>
+#include <linux/crc32.h>
+#include "audio/audio.h"
+#include <linux/version.h>
+
+static dev_t bce_chrdev;
+static struct class *bce_class;
+
+struct apple_bce_device *global_bce;
+
+static int bce_create_command_queues(struct apple_bce_device *bce);
+static void bce_free_command_queues(struct apple_bce_device *bce);
+static irqreturn_t bce_handle_mb_irq(int irq, void *dev);
+static irqreturn_t bce_handle_dma_irq(int irq, void *dev);
+static int bce_fw_version_handshake(struct apple_bce_device *bce);
+static int bce_register_command_queue(struct apple_bce_device *bce, struct bce_queue_memcfg *cfg, int is_sq);
+
+static int apple_bce_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+    struct apple_bce_device *bce = NULL;
+    int status = 0;
+    int nvec;
+
+    pr_info("apple-bce: capturing our device\n");
+
+    if (pci_enable_device(dev))
+        return -ENODEV;
+    if (pci_request_regions(dev, "apple-bce")) {
+        status = -ENODEV;
+        goto fail;
+    }
+    pci_set_master(dev);
+    nvec = pci_alloc_irq_vectors(dev, 1, 8, PCI_IRQ_MSI);
+    if (nvec < 5) {
+        status = -EINVAL;
+        goto fail;
+    }
+
+    bce = kzalloc(sizeof(struct apple_bce_device), GFP_KERNEL);
+    if (!bce) {
+        status = -ENOMEM;
+        goto fail;
+    }
+
+    bce->pci = dev;
+    pci_set_drvdata(dev, bce);
+
+    bce->devt = bce_chrdev;
+    bce->dev = device_create(bce_class, &dev->dev, bce->devt, NULL, "apple-bce");
+    if (IS_ERR_OR_NULL(bce->dev)) {
+        status = PTR_ERR(bce_class);
+        goto fail;
+    }
+
+    bce->reg_mem_mb = pci_iomap(dev, 4, 0);
+    bce->reg_mem_dma = pci_iomap(dev, 2, 0);
+
+    if (IS_ERR_OR_NULL(bce->reg_mem_mb) || IS_ERR_OR_NULL(bce->reg_mem_dma)) {
+        dev_warn(&dev->dev, "apple-bce: Failed to pci_iomap required regions\n");
+        goto fail;
+    }
+
+    bce_mailbox_init(&bce->mbox, bce->reg_mem_mb);
+    bce_timestamp_init(&bce->timestamp, bce->reg_mem_mb);
+
+    spin_lock_init(&bce->queues_lock);
+    ida_init(&bce->queue_ida);
+
+    if ((status = pci_request_irq(dev, 0, bce_handle_mb_irq, NULL, dev, "bce_mbox")))
+        goto fail;
+    if ((status = pci_request_irq(dev, 4, NULL, bce_handle_dma_irq, dev, "bce_dma")))
+        goto fail_interrupt_0;
+
+    if ((status = dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(37)))) {
+        dev_warn(&dev->dev, "dma: Setting mask failed\n");
+        goto fail_interrupt;
+    }
+
+    /* Gets the function 0's interface. This is needed because Apple only accepts DMA on our function if function 0
+       is a bus master, so we need to work around this. */
+    bce->pci0 = pci_get_slot(dev->bus, PCI_DEVFN(PCI_SLOT(dev->devfn), 0));
+#ifndef WITHOUT_NVME_PATCH
+    if ((status = pci_enable_device_mem(bce->pci0))) {
+        dev_warn(&dev->dev, "apple-bce: failed to enable function 0\n");
+        goto fail_dev0;
+    }
+#endif
+    pci_set_master(bce->pci0);
+
+    bce_timestamp_start(&bce->timestamp, true);
+
+    if ((status = bce_fw_version_handshake(bce)))
+        goto fail_ts;
+    pr_info("apple-bce: handshake done\n");
+
+    if ((status = bce_create_command_queues(bce))) {
+        pr_info("apple-bce: Creating command queues failed\n");
+        goto fail_ts;
+    }
+
+    global_bce = bce;
+
+    bce_vhci_create(bce, &bce->vhci);
+
+    return 0;
+
+fail_ts:
+    bce_timestamp_stop(&bce->timestamp);
+#ifndef WITHOUT_NVME_PATCH
+    pci_disable_device(bce->pci0);
+fail_dev0:
+#endif
+    pci_dev_put(bce->pci0);
+fail_interrupt:
+    pci_free_irq(dev, 4, dev);
+fail_interrupt_0:
+    pci_free_irq(dev, 0, dev);
+fail:
+    if (bce && bce->dev) {
+        device_destroy(bce_class, bce->devt);
+
+        if (!IS_ERR_OR_NULL(bce->reg_mem_mb))
+            pci_iounmap(dev, bce->reg_mem_mb);
+        if (!IS_ERR_OR_NULL(bce->reg_mem_dma))
+            pci_iounmap(dev, bce->reg_mem_dma);
+
+        kfree(bce);
+    }
+
+    pci_free_irq_vectors(dev);
+    pci_release_regions(dev);
+    pci_disable_device(dev);
+
+    if (!status)
+        status = -EINVAL;
+    return status;
+}
+
+static int bce_create_command_queues(struct apple_bce_device *bce)
+{
+    int status;
+    struct bce_queue_memcfg *cfg;
+
+    bce->cmd_cq = bce_alloc_cq(bce, 0, 0x20);
+    bce->cmd_cmdq = bce_alloc_cmdq(bce, 1, 0x20);
+    if (bce->cmd_cq == NULL || bce->cmd_cmdq == NULL) {
+        status = -ENOMEM;
+        goto err;
+    }
+    bce->queues[0] = (struct bce_queue *) bce->cmd_cq;
+    bce->queues[1] = (struct bce_queue *) bce->cmd_cmdq->sq;
+
+    cfg = kzalloc(sizeof(struct bce_queue_memcfg), GFP_KERNEL);
+    if (!cfg) {
+        status = -ENOMEM;
+        goto err;
+    }
+    bce_get_cq_memcfg(bce->cmd_cq, cfg);
+    if ((status = bce_register_command_queue(bce, cfg, false)))
+        goto err;
+    bce_get_sq_memcfg(bce->cmd_cmdq->sq, bce->cmd_cq, cfg);
+    if ((status = bce_register_command_queue(bce, cfg, true)))
+        goto err;
+    kfree(cfg);
+
+    return 0;
+
+err:
+    if (bce->cmd_cq)
+        bce_free_cq(bce, bce->cmd_cq);
+    if (bce->cmd_cmdq)
+        bce_free_cmdq(bce, bce->cmd_cmdq);
+    return status;
+}
+
+static void bce_free_command_queues(struct apple_bce_device *bce)
+{
+    bce_free_cq(bce, bce->cmd_cq);
+    bce_free_cmdq(bce, bce->cmd_cmdq);
+    bce->cmd_cq = NULL;
+    bce->queues[0] = NULL;
+}
+
+static irqreturn_t bce_handle_mb_irq(int irq, void *dev)
+{
+    struct apple_bce_device *bce = pci_get_drvdata(dev);
+    bce_mailbox_handle_interrupt(&bce->mbox);
+    return IRQ_HANDLED;
+}
+
+static irqreturn_t bce_handle_dma_irq(int irq, void *dev)
+{
+    int i;
+    struct apple_bce_device *bce = pci_get_drvdata(dev);
+    spin_lock(&bce->queues_lock);
+    for (i = 0; i < BCE_MAX_QUEUE_COUNT; i++)
+        if (bce->queues[i] && bce->queues[i]->type == BCE_QUEUE_CQ)
+            bce_handle_cq_completions(bce, (struct bce_queue_cq *) bce->queues[i]);
+    spin_unlock(&bce->queues_lock);
+    return IRQ_HANDLED;
+}
+
+static int bce_fw_version_handshake(struct apple_bce_device *bce)
+{
+    u64 result;
+    int status;
+
+    if ((status = bce_mailbox_send(&bce->mbox, BCE_MB_MSG(BCE_MB_SET_FW_PROTOCOL_VERSION, BC_PROTOCOL_VERSION),
+            &result)))
+        return status;
+    if (BCE_MB_TYPE(result) != BCE_MB_SET_FW_PROTOCOL_VERSION ||
+        BCE_MB_VALUE(result) != BC_PROTOCOL_VERSION) {
+        pr_err("apple-bce: FW version handshake failed %x:%llx\n", BCE_MB_TYPE(result), BCE_MB_VALUE(result));
+        return -EINVAL;
+    }
+    return 0;
+}
+
+static int bce_register_command_queue(struct apple_bce_device *bce, struct bce_queue_memcfg *cfg, int is_sq)
+{
+    int status;
+    int cmd_type;
+    u64 result;
+    // OS X uses an bidirectional direction, but that's not really needed
+    dma_addr_t a = dma_map_single(&bce->pci->dev, cfg, sizeof(struct bce_queue_memcfg), DMA_TO_DEVICE);
+    if (dma_mapping_error(&bce->pci->dev, a))
+        return -ENOMEM;
+    cmd_type = is_sq ? BCE_MB_REGISTER_COMMAND_SQ : BCE_MB_REGISTER_COMMAND_CQ;
+    status = bce_mailbox_send(&bce->mbox, BCE_MB_MSG(cmd_type, a), &result);
+    dma_unmap_single(&bce->pci->dev, a, sizeof(struct bce_queue_memcfg), DMA_TO_DEVICE);
+    if (status)
+        return status;
+    if (BCE_MB_TYPE(result) != BCE_MB_REGISTER_COMMAND_QUEUE_REPLY)
+        return -EINVAL;
+    return 0;
+}
+
+static void apple_bce_remove(struct pci_dev *dev)
+{
+    struct apple_bce_device *bce = pci_get_drvdata(dev);
+    bce->is_being_removed = true;
+
+    bce_vhci_destroy(&bce->vhci);
+
+    bce_timestamp_stop(&bce->timestamp);
+#ifndef WITHOUT_NVME_PATCH
+    pci_disable_device(bce->pci0);
+#endif
+    pci_dev_put(bce->pci0);
+    pci_free_irq(dev, 0, dev);
+    pci_free_irq(dev, 4, dev);
+    bce_free_command_queues(bce);
+    pci_iounmap(dev, bce->reg_mem_mb);
+    pci_iounmap(dev, bce->reg_mem_dma);
+    device_destroy(bce_class, bce->devt);
+    pci_free_irq_vectors(dev);
+    pci_release_regions(dev);
+    pci_disable_device(dev);
+    kfree(bce);
+}
+
+static int bce_save_state_and_sleep(struct apple_bce_device *bce)
+{
+    int attempt, status = 0;
+    u64 resp;
+    dma_addr_t dma_addr;
+    void *dma_ptr = NULL;
+    size_t size = max(PAGE_SIZE, 4096UL);
+
+    for (attempt = 0; attempt < 5; ++attempt) {
+        pr_debug("apple-bce: suspend: attempt %i, buffer size %li\n", attempt, size);
+        dma_ptr = dma_alloc_coherent(&bce->pci->dev, size, &dma_addr, GFP_KERNEL);
+        if (!dma_ptr) {
+            pr_err("apple-bce: suspend failed (data alloc failed)\n");
+            break;
+        }
+        BUG_ON((dma_addr % 4096) != 0);
+        status = bce_mailbox_send(&bce->mbox,
+                BCE_MB_MSG(BCE_MB_SAVE_STATE_AND_SLEEP, (dma_addr & ~(4096LLU - 1)) | (size / 4096)), &resp);
+        if (status) {
+            pr_err("apple-bce: suspend failed (mailbox send)\n");
+            break;
+        }
+        if (BCE_MB_TYPE(resp) == BCE_MB_SAVE_RESTORE_STATE_COMPLETE) {
+            bce->saved_data_dma_addr = dma_addr;
+            bce->saved_data_dma_ptr = dma_ptr;
+            bce->saved_data_dma_size = size;
+            return 0;
+        } else if (BCE_MB_TYPE(resp) == BCE_MB_SAVE_STATE_AND_SLEEP_FAILURE) {
+            dma_free_coherent(&bce->pci->dev, size, dma_ptr, dma_addr);
+            /* The 0x10ff magic value was extracted from Apple's driver */
+            size = (BCE_MB_VALUE(resp) + 0x10ff) & ~(4096LLU - 1);
+            pr_debug("apple-bce: suspend: device requested a larger buffer (%li)\n", size);
+            continue;
+        } else {
+            pr_err("apple-bce: suspend failed (invalid device response)\n");
+            status = -EINVAL;
+            break;
+        }
+    }
+    if (dma_ptr)
+        dma_free_coherent(&bce->pci->dev, size, dma_ptr, dma_addr);
+    if (!status)
+        return bce_mailbox_send(&bce->mbox, BCE_MB_MSG(BCE_MB_SLEEP_NO_STATE, 0), &resp);
+    return status;
+}
+
+static int bce_restore_state_and_wake(struct apple_bce_device *bce)
+{
+    int status;
+    u64 resp;
+    if (!bce->saved_data_dma_ptr) {
+        if ((status = bce_mailbox_send(&bce->mbox, BCE_MB_MSG(BCE_MB_RESTORE_NO_STATE, 0), &resp))) {
+            pr_err("apple-bce: resume with no state failed (mailbox send)\n");
+            return status;
+        }
+        if (BCE_MB_TYPE(resp) != BCE_MB_RESTORE_NO_STATE) {
+            pr_err("apple-bce: resume with no state failed (invalid device response)\n");
+            return -EINVAL;
+        }
+        return 0;
+    }
+
+    if ((status = bce_mailbox_send(&bce->mbox, BCE_MB_MSG(BCE_MB_RESTORE_STATE_AND_WAKE,
+            (bce->saved_data_dma_addr & ~(4096LLU - 1)) | (bce->saved_data_dma_size / 4096)), &resp))) {
+        pr_err("apple-bce: resume with state failed (mailbox send)\n");
+        goto finish_with_state;
+    }
+    if (BCE_MB_TYPE(resp) != BCE_MB_SAVE_RESTORE_STATE_COMPLETE) {
+        pr_err("apple-bce: resume with state failed (invalid device response)\n");
+        status = -EINVAL;
+        goto finish_with_state;
+    }
+
+finish_with_state:
+    dma_free_coherent(&bce->pci->dev, bce->saved_data_dma_size, bce->saved_data_dma_ptr, bce->saved_data_dma_addr);
+    bce->saved_data_dma_ptr = NULL;
+    return status;
+}
+
+static int apple_bce_suspend(struct device *dev)
+{
+    struct apple_bce_device *bce = pci_get_drvdata(to_pci_dev(dev));
+    int status;
+
+    bce_timestamp_stop(&bce->timestamp);
+
+    if ((status = bce_save_state_and_sleep(bce)))
+        return status;
+
+    return 0;
+}
+
+static int apple_bce_resume(struct device *dev)
+{
+    struct apple_bce_device *bce = pci_get_drvdata(to_pci_dev(dev));
+    int status;
+
+    pci_set_master(bce->pci);
+    pci_set_master(bce->pci0);
+
+    if ((status = bce_restore_state_and_wake(bce)))
+        return status;
+
+    bce_timestamp_start(&bce->timestamp, false);
+
+    return 0;
+}
+
+static struct pci_device_id apple_bce_ids[  ] = {
+        { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x1801) },
+        { 0, },
+};
+
+MODULE_DEVICE_TABLE(pci, apple_bce_ids);
+
+struct dev_pm_ops apple_bce_pci_driver_pm = {
+        .suspend = apple_bce_suspend,
+        .resume = apple_bce_resume
+};
+struct pci_driver apple_bce_pci_driver = {
+        .name = "apple-bce",
+        .id_table = apple_bce_ids,
+        .probe = apple_bce_probe,
+        .remove = apple_bce_remove,
+        .driver = {
+                .pm = &apple_bce_pci_driver_pm
+        }
+};
+
+
+static int __init apple_bce_module_init(void)
+{
+    int result;
+    if ((result = alloc_chrdev_region(&bce_chrdev, 0, 1, "apple-bce")))
+        goto fail_chrdev;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6,4,0)
+    bce_class = class_create(THIS_MODULE, "apple-bce");
+#else
+    bce_class = class_create("apple-bce");
+#endif
+    if (IS_ERR(bce_class)) {
+        result = PTR_ERR(bce_class);
+        goto fail_class;
+    }
+    if ((result = bce_vhci_module_init())) {
+        pr_err("apple-bce: bce-vhci init failed");
+        goto fail_class;
+    }
+
+    result = pci_register_driver(&apple_bce_pci_driver);
+    if (result)
+        goto fail_drv;
+
+    aaudio_module_init();
+
+    return 0;
+
+fail_drv:
+    pci_unregister_driver(&apple_bce_pci_driver);
+fail_class:
+    class_destroy(bce_class);
+fail_chrdev:
+    unregister_chrdev_region(bce_chrdev, 1);
+    if (!result)
+        result = -EINVAL;
+    return result;
+}
+static void __exit apple_bce_module_exit(void)
+{
+    pci_unregister_driver(&apple_bce_pci_driver);
+
+    aaudio_module_exit();
+    bce_vhci_module_exit();
+    class_destroy(bce_class);
+    unregister_chrdev_region(bce_chrdev, 1);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("MrARM");
+MODULE_DESCRIPTION("Apple BCE Driver");
+MODULE_VERSION("0.01");
+module_init(apple_bce_module_init);
+module_exit(apple_bce_module_exit);
diff --git a/drivers/staging/apple-bce/apple_bce.h b/drivers/staging/apple-bce/apple_bce.h
new file mode 100644
index 000000000000..f13ab8d5742e
--- /dev/null
+++ b/drivers/staging/apple-bce/apple_bce.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include "mailbox.h"
+#include "queue.h"
+#include "vhci/vhci.h"
+
+#define BC_PROTOCOL_VERSION 0x20001
+#define BCE_MAX_QUEUE_COUNT 0x100
+
+#define BCE_QUEUE_USER_MIN 2
+#define BCE_QUEUE_USER_MAX (BCE_MAX_QUEUE_COUNT - 1)
+
+struct apple_bce_device {
+    struct pci_dev *pci, *pci0;
+    dev_t devt;
+    struct device *dev;
+    void __iomem *reg_mem_mb;
+    void __iomem *reg_mem_dma;
+    struct bce_mailbox mbox;
+    struct bce_timestamp timestamp;
+    struct bce_queue *queues[BCE_MAX_QUEUE_COUNT];
+    struct spinlock queues_lock;
+    struct ida queue_ida;
+    struct bce_queue_cq *cmd_cq;
+    struct bce_queue_cmdq *cmd_cmdq;
+    struct bce_queue_sq *int_sq_list[BCE_MAX_QUEUE_COUNT];
+    bool is_being_removed;
+
+    dma_addr_t saved_data_dma_addr;
+    void *saved_data_dma_ptr;
+    size_t saved_data_dma_size;
+
+    struct bce_vhci vhci;
+};
+
+extern struct apple_bce_device *global_bce;
\ No newline at end of file
diff --git a/drivers/staging/apple-bce/audio/audio.c b/drivers/staging/apple-bce/audio/audio.c
new file mode 100644
index 000000000000..bd16ddd16c1d
--- /dev/null
+++ b/drivers/staging/apple-bce/audio/audio.c
@@ -0,0 +1,711 @@
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <sound/core.h>
+#include <sound/initval.h>
+#include <sound/pcm.h>
+#include <sound/jack.h>
+#include "audio.h"
+#include "pcm.h"
+#include <linux/version.h>
+
+static int aaudio_alsa_index = SNDRV_DEFAULT_IDX1;
+static char *aaudio_alsa_id = SNDRV_DEFAULT_STR1;
+
+static dev_t aaudio_chrdev;
+static struct class *aaudio_class;
+
+static int aaudio_init_cmd(struct aaudio_device *a);
+static int aaudio_init_bs(struct aaudio_device *a);
+static void aaudio_init_dev(struct aaudio_device *a, aaudio_device_id_t dev_id);
+static void aaudio_free_dev(struct aaudio_subdevice *sdev);
+
+static int aaudio_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+    struct aaudio_device *aaudio = NULL;
+    struct aaudio_subdevice *sdev = NULL;
+    int status = 0;
+    u32 cfg;
+
+    pr_info("aaudio: capturing our device\n");
+
+    if (pci_enable_device(dev))
+        return -ENODEV;
+    if (pci_request_regions(dev, "aaudio")) {
+        status = -ENODEV;
+        goto fail;
+    }
+    pci_set_master(dev);
+
+    aaudio = kzalloc(sizeof(struct aaudio_device), GFP_KERNEL);
+    if (!aaudio) {
+        status = -ENOMEM;
+        goto fail;
+    }
+
+    aaudio->bce = global_bce;
+    if (!aaudio->bce) {
+        dev_warn(&dev->dev, "aaudio: No BCE available\n");
+        status = -EINVAL;
+        goto fail;
+    }
+
+    aaudio->pci = dev;
+    pci_set_drvdata(dev, aaudio);
+
+    aaudio->devt = aaudio_chrdev;
+    aaudio->dev = device_create(aaudio_class, &dev->dev, aaudio->devt, NULL, "aaudio");
+    if (IS_ERR_OR_NULL(aaudio->dev)) {
+        status = PTR_ERR(aaudio_class);
+        goto fail;
+    }
+    device_link_add(aaudio->dev, aaudio->bce->dev, DL_FLAG_PM_RUNTIME | DL_FLAG_AUTOREMOVE_CONSUMER);
+
+    init_completion(&aaudio->remote_alive);
+    INIT_LIST_HEAD(&aaudio->subdevice_list);
+
+    /* Init: set an unknown flag in the bitset */
+    if (pci_read_config_dword(dev, 4, &cfg))
+        dev_warn(&dev->dev, "aaudio: pci_read_config_dword fail\n");
+    if (pci_write_config_dword(dev, 4, cfg | 6u))
+        dev_warn(&dev->dev, "aaudio: pci_write_config_dword fail\n");
+
+    dev_info(aaudio->dev, "aaudio: bs len = %llx\n", pci_resource_len(dev, 0));
+    aaudio->reg_mem_bs_dma = pci_resource_start(dev, 0);
+    aaudio->reg_mem_bs = pci_iomap(dev, 0, 0);
+    aaudio->reg_mem_cfg = pci_iomap(dev, 4, 0);
+
+    aaudio->reg_mem_gpr = (u32 __iomem *) ((u8 __iomem *) aaudio->reg_mem_cfg + 0xC000);
+
+    if (IS_ERR_OR_NULL(aaudio->reg_mem_bs) || IS_ERR_OR_NULL(aaudio->reg_mem_cfg)) {
+        dev_warn(&dev->dev, "aaudio: Failed to pci_iomap required regions\n");
+        goto fail;
+    }
+
+    if (aaudio_bce_init(aaudio)) {
+        dev_warn(&dev->dev, "aaudio: Failed to init BCE command transport\n");
+        goto fail;
+    }
+
+    if (snd_card_new(aaudio->dev, aaudio_alsa_index, aaudio_alsa_id, THIS_MODULE, 0, &aaudio->card)) {
+        dev_err(&dev->dev, "aaudio: Failed to create ALSA card\n");
+        goto fail;
+    }
+
+    strcpy(aaudio->card->shortname, "Apple T2 Audio");
+    strcpy(aaudio->card->longname, "Apple T2 Audio");
+    strcpy(aaudio->card->mixername, "Apple T2 Audio");
+    /* Dynamic alsa ids start at 100 */
+    aaudio->next_alsa_id = 100;
+
+    if (aaudio_init_cmd(aaudio)) {
+        dev_err(&dev->dev, "aaudio: Failed to initialize over BCE\n");
+        goto fail_snd;
+    }
+
+    if (aaudio_init_bs(aaudio)) {
+        dev_err(&dev->dev, "aaudio: Failed to initialize BufferStruct\n");
+        goto fail_snd;
+    }
+
+    if ((status = aaudio_cmd_set_remote_access(aaudio, AAUDIO_REMOTE_ACCESS_ON))) {
+        dev_err(&dev->dev, "Failed to set remote access\n");
+        return status;
+    }
+
+    if (snd_card_register(aaudio->card)) {
+        dev_err(&dev->dev, "aaudio: Failed to register ALSA sound device\n");
+        goto fail_snd;
+    }
+
+    list_for_each_entry(sdev, &aaudio->subdevice_list, list) {
+        struct aaudio_buffer_struct_device *dev = &aaudio->bs->devices[sdev->buf_id];
+
+        if (sdev->out_stream_cnt == 1 && !strcmp(dev->name, "Speaker")) {
+            struct snd_pcm_hardware *hw = sdev->out_streams[0].alsa_hw_desc;
+
+            snprintf(aaudio->card->driver, sizeof(aaudio->card->driver) / sizeof(char), "AppleT2x%d", hw->channels_min);
+        }
+    }
+
+    return 0;
+
+fail_snd:
+    snd_card_free(aaudio->card);
+fail:
+    if (aaudio && aaudio->dev)
+        device_destroy(aaudio_class, aaudio->devt);
+    kfree(aaudio);
+
+    if (!IS_ERR_OR_NULL(aaudio->reg_mem_bs))
+        pci_iounmap(dev, aaudio->reg_mem_bs);
+    if (!IS_ERR_OR_NULL(aaudio->reg_mem_cfg))
+        pci_iounmap(dev, aaudio->reg_mem_cfg);
+
+    pci_release_regions(dev);
+    pci_disable_device(dev);
+
+    if (!status)
+        status = -EINVAL;
+    return status;
+}
+
+
+
+static void aaudio_remove(struct pci_dev *dev)
+{
+    struct aaudio_subdevice *sdev;
+    struct aaudio_device *aaudio = pci_get_drvdata(dev);
+
+    snd_card_free(aaudio->card);
+    while (!list_empty(&aaudio->subdevice_list)) {
+        sdev = list_first_entry(&aaudio->subdevice_list, struct aaudio_subdevice, list);
+        list_del(&sdev->list);
+        aaudio_free_dev(sdev);
+    }
+    pci_iounmap(dev, aaudio->reg_mem_bs);
+    pci_iounmap(dev, aaudio->reg_mem_cfg);
+    device_destroy(aaudio_class, aaudio->devt);
+    pci_free_irq_vectors(dev);
+    pci_release_regions(dev);
+    pci_disable_device(dev);
+    kfree(aaudio);
+}
+
+static int aaudio_suspend(struct device *dev)
+{
+    struct aaudio_device *aaudio = pci_get_drvdata(to_pci_dev(dev));
+
+    if (aaudio_cmd_set_remote_access(aaudio, AAUDIO_REMOTE_ACCESS_OFF))
+        dev_warn(aaudio->dev, "Failed to reset remote access\n");
+
+    pci_disable_device(aaudio->pci);
+    return 0;
+}
+
+static int aaudio_resume(struct device *dev)
+{
+    int status;
+    struct aaudio_device *aaudio = pci_get_drvdata(to_pci_dev(dev));
+
+    if ((status = pci_enable_device(aaudio->pci)))
+        return status;
+    pci_set_master(aaudio->pci);
+
+    if ((status = aaudio_cmd_set_remote_access(aaudio, AAUDIO_REMOTE_ACCESS_ON))) {
+        dev_err(aaudio->dev, "Failed to set remote access\n");
+        return status;
+    }
+
+    return 0;
+}
+
+static int aaudio_init_cmd(struct aaudio_device *a)
+{
+    int status;
+    struct aaudio_send_ctx sctx;
+    struct aaudio_msg buf;
+    u64 dev_cnt, dev_i;
+    aaudio_device_id_t *dev_l;
+
+    if ((status = aaudio_send(a, &sctx, 500,
+                              aaudio_msg_write_alive_notification, 1, 3))) {
+        dev_err(a->dev, "Sending alive notification failed\n");
+        return status;
+    }
+
+    if (wait_for_completion_timeout(&a->remote_alive, msecs_to_jiffies(500)) == 0) {
+        dev_err(a->dev, "Timed out waiting for remote\n");
+        return -ETIMEDOUT;
+    }
+    dev_info(a->dev, "Continuing init\n");
+
+    buf = aaudio_reply_alloc();
+    if ((status = aaudio_cmd_get_device_list(a, &buf, &dev_l, &dev_cnt))) {
+        dev_err(a->dev, "Failed to get device list\n");
+        aaudio_reply_free(&buf);
+        return status;
+    }
+    for (dev_i = 0; dev_i < dev_cnt; ++dev_i)
+        aaudio_init_dev(a, dev_l[dev_i]);
+    aaudio_reply_free(&buf);
+
+    return 0;
+}
+
+static void aaudio_init_stream_info(struct aaudio_subdevice *sdev, struct aaudio_stream *strm);
+static void aaudio_handle_jack_connection_change(struct aaudio_subdevice *sdev);
+
+static void aaudio_init_dev(struct aaudio_device *a, aaudio_device_id_t dev_id)
+{
+    struct aaudio_subdevice *sdev;
+    struct aaudio_msg buf = aaudio_reply_alloc();
+    u64 uid_len, stream_cnt, i;
+    aaudio_object_id_t *stream_list;
+    char *uid;
+
+    sdev = kzalloc(sizeof(struct aaudio_subdevice), GFP_KERNEL);
+
+    if (aaudio_cmd_get_property(a, &buf, dev_id, dev_id, AAUDIO_PROP(AAUDIO_PROP_SCOPE_GLOBAL, AAUDIO_PROP_UID, 0),
+            NULL, 0, (void **) &uid, &uid_len) || uid_len > AAUDIO_DEVICE_MAX_UID_LEN) {
+        dev_err(a->dev, "Failed to get device uid for device %llx\n", dev_id);
+        goto fail;
+    }
+    dev_info(a->dev, "Remote device %llx %.*s\n", dev_id, (int) uid_len, uid);
+
+    sdev->a = a;
+    INIT_LIST_HEAD(&sdev->list);
+    sdev->dev_id = dev_id;
+    sdev->buf_id = AAUDIO_BUFFER_ID_NONE;
+    strncpy(sdev->uid, uid, uid_len);
+    sdev->uid[uid_len + 1] = '\0';
+
+    if (aaudio_cmd_get_primitive_property(a, dev_id, dev_id,
+            AAUDIO_PROP(AAUDIO_PROP_SCOPE_INPUT, AAUDIO_PROP_LATENCY, 0), NULL, 0, &sdev->in_latency, sizeof(u32)))
+        dev_warn(a->dev, "Failed to query device input latency\n");
+    if (aaudio_cmd_get_primitive_property(a, dev_id, dev_id,
+            AAUDIO_PROP(AAUDIO_PROP_SCOPE_OUTPUT, AAUDIO_PROP_LATENCY, 0), NULL, 0, &sdev->out_latency, sizeof(u32)))
+        dev_warn(a->dev, "Failed to query device output latency\n");
+
+    if (aaudio_cmd_get_input_stream_list(a, &buf, dev_id, &stream_list, &stream_cnt)) {
+        dev_err(a->dev, "Failed to get input stream list for device %llx\n", dev_id);
+        goto fail;
+    }
+    if (stream_cnt > AAUDIO_DEIVCE_MAX_INPUT_STREAMS) {
+        dev_warn(a->dev, "Device %s input stream count %llu is larger than the supported count of %u\n",
+                sdev->uid, stream_cnt, AAUDIO_DEIVCE_MAX_INPUT_STREAMS);
+        stream_cnt = AAUDIO_DEIVCE_MAX_INPUT_STREAMS;
+    }
+    sdev->in_stream_cnt = stream_cnt;
+    for (i = 0; i < stream_cnt; i++) {
+        sdev->in_streams[i].id = stream_list[i];
+        sdev->in_streams[i].buffer_cnt = 0;
+        aaudio_init_stream_info(sdev, &sdev->in_streams[i]);
+        sdev->in_streams[i].latency += sdev->in_latency;
+    }
+
+    if (aaudio_cmd_get_output_stream_list(a, &buf, dev_id, &stream_list, &stream_cnt)) {
+        dev_err(a->dev, "Failed to get output stream list for device %llx\n", dev_id);
+        goto fail;
+    }
+    if (stream_cnt > AAUDIO_DEIVCE_MAX_OUTPUT_STREAMS) {
+        dev_warn(a->dev, "Device %s input stream count %llu is larger than the supported count of %u\n",
+                 sdev->uid, stream_cnt, AAUDIO_DEIVCE_MAX_OUTPUT_STREAMS);
+        stream_cnt = AAUDIO_DEIVCE_MAX_OUTPUT_STREAMS;
+    }
+    sdev->out_stream_cnt = stream_cnt;
+    for (i = 0; i < stream_cnt; i++) {
+        sdev->out_streams[i].id = stream_list[i];
+        sdev->out_streams[i].buffer_cnt = 0;
+        aaudio_init_stream_info(sdev, &sdev->out_streams[i]);
+        sdev->out_streams[i].latency += sdev->in_latency;
+    }
+
+    if (sdev->is_pcm)
+        aaudio_create_pcm(sdev);
+    /* Headphone Jack status */
+    if (!strcmp(sdev->uid, "Codec Output")) {
+        if (snd_jack_new(a->card, sdev->uid, SND_JACK_HEADPHONE, &sdev->jack, true, false))
+            dev_warn(a->dev, "Failed to create an attached jack for %s\n", sdev->uid);
+        aaudio_cmd_property_listener(a, sdev->dev_id, sdev->dev_id,
+                AAUDIO_PROP(AAUDIO_PROP_SCOPE_OUTPUT, AAUDIO_PROP_JACK_PLUGGED, 0));
+        aaudio_handle_jack_connection_change(sdev);
+    }
+
+    aaudio_reply_free(&buf);
+
+    list_add_tail(&sdev->list, &a->subdevice_list);
+    return;
+
+fail:
+    aaudio_reply_free(&buf);
+    kfree(sdev);
+}
+
+static void aaudio_init_stream_info(struct aaudio_subdevice *sdev, struct aaudio_stream *strm)
+{
+    if (aaudio_cmd_get_primitive_property(sdev->a, sdev->dev_id, strm->id,
+            AAUDIO_PROP(AAUDIO_PROP_SCOPE_GLOBAL, AAUDIO_PROP_PHYS_FORMAT, 0), NULL, 0,
+            &strm->desc, sizeof(strm->desc)))
+        dev_warn(sdev->a->dev, "Failed to query stream descriptor\n");
+    if (aaudio_cmd_get_primitive_property(sdev->a, sdev->dev_id, strm->id,
+            AAUDIO_PROP(AAUDIO_PROP_SCOPE_GLOBAL, AAUDIO_PROP_LATENCY, 0), NULL, 0, &strm->latency, sizeof(u32)))
+        dev_warn(sdev->a->dev, "Failed to query stream latency\n");
+    if (strm->desc.format_id == AAUDIO_FORMAT_LPCM)
+        sdev->is_pcm = true;
+}
+
+static void aaudio_free_dev(struct aaudio_subdevice *sdev)
+{
+    size_t i;
+    for (i = 0; i < sdev->in_stream_cnt; i++) {
+        if (sdev->in_streams[i].alsa_hw_desc)
+            kfree(sdev->in_streams[i].alsa_hw_desc);
+        if (sdev->in_streams[i].buffers)
+            kfree(sdev->in_streams[i].buffers);
+    }
+    for (i = 0; i < sdev->out_stream_cnt; i++) {
+        if (sdev->out_streams[i].alsa_hw_desc)
+            kfree(sdev->out_streams[i].alsa_hw_desc);
+        if (sdev->out_streams[i].buffers)
+            kfree(sdev->out_streams[i].buffers);
+    }
+    kfree(sdev);
+}
+
+static struct aaudio_subdevice *aaudio_find_dev_by_dev_id(struct aaudio_device *a, aaudio_device_id_t dev_id)
+{
+    struct aaudio_subdevice *sdev;
+    list_for_each_entry(sdev, &a->subdevice_list, list) {
+        if (dev_id == sdev->dev_id)
+            return sdev;
+    }
+    return NULL;
+}
+
+static struct aaudio_subdevice *aaudio_find_dev_by_uid(struct aaudio_device *a, const char *uid)
+{
+    struct aaudio_subdevice *sdev;
+    list_for_each_entry(sdev, &a->subdevice_list, list) {
+        if (!strcmp(uid, sdev->uid))
+            return sdev;
+    }
+    return NULL;
+}
+
+static void aaudio_init_bs_stream(struct aaudio_device *a, struct aaudio_stream *strm,
+        struct aaudio_buffer_struct_stream *bs_strm);
+static void aaudio_init_bs_stream_host(struct aaudio_device *a, struct aaudio_stream *strm,
+        struct aaudio_buffer_struct_stream *bs_strm);
+
+static int aaudio_init_bs(struct aaudio_device *a)
+{
+    int i, j;
+    struct aaudio_buffer_struct_device *dev;
+    struct aaudio_subdevice *sdev;
+    u32 ver, sig, bs_base;
+
+    ver = ioread32(&a->reg_mem_gpr[0]);
+    if (ver < 3) {
+        dev_err(a->dev, "aaudio: Bad GPR version (%u)", ver);
+        return -EINVAL;
+    }
+    sig = ioread32(&a->reg_mem_gpr[1]);
+    if (sig != AAUDIO_SIG) {
+        dev_err(a->dev, "aaudio: Bad GPR sig (%x)", sig);
+        return -EINVAL;
+    }
+    bs_base = ioread32(&a->reg_mem_gpr[2]);
+    a->bs = (struct aaudio_buffer_struct *) ((u8 *) a->reg_mem_bs + bs_base);
+    if (a->bs->signature != AAUDIO_SIG) {
+        dev_err(a->dev, "aaudio: Bad BufferStruct sig (%x)", a->bs->signature);
+        return -EINVAL;
+    }
+    dev_info(a->dev, "aaudio: BufferStruct ver = %i\n", a->bs->version);
+    dev_info(a->dev, "aaudio: Num devices = %i\n", a->bs->num_devices);
+    for (i = 0; i < a->bs->num_devices; i++) {
+        dev = &a->bs->devices[i];
+        dev_info(a->dev, "aaudio: Device %i %s\n", i, dev->name);
+
+        sdev = aaudio_find_dev_by_uid(a, dev->name);
+        if (!sdev) {
+            dev_err(a->dev, "aaudio: Subdevice not found for BufferStruct device %s\n", dev->name);
+            continue;
+        }
+        sdev->buf_id = (u8) i;
+        dev->num_input_streams = 0;
+        for (j = 0; j < dev->num_output_streams; j++) {
+            dev_info(a->dev, "aaudio: Device %i Stream %i: Output; Buffer Count = %i\n", i, j,
+                     dev->output_streams[j].num_buffers);
+            if (j < sdev->out_stream_cnt)
+                aaudio_init_bs_stream(a, &sdev->out_streams[j], &dev->output_streams[j]);
+        }
+    }
+
+    list_for_each_entry(sdev, &a->subdevice_list, list) {
+        if (sdev->buf_id != AAUDIO_BUFFER_ID_NONE)
+            continue;
+        sdev->buf_id = i;
+        dev_info(a->dev, "aaudio: Created device %i %s\n", i, sdev->uid);
+        strcpy(a->bs->devices[i].name, sdev->uid);
+        a->bs->devices[i].num_input_streams = 0;
+        a->bs->devices[i].num_output_streams = 0;
+        a->bs->num_devices = ++i;
+    }
+    list_for_each_entry(sdev, &a->subdevice_list, list) {
+        if (sdev->in_stream_cnt == 1) {
+            dev_info(a->dev, "aaudio: Device %i Host Stream; Input\n", sdev->buf_id);
+            aaudio_init_bs_stream_host(a, &sdev->in_streams[0], &a->bs->devices[sdev->buf_id].input_streams[0]);
+            a->bs->devices[sdev->buf_id].num_input_streams = 1;
+            wmb();
+
+            if (aaudio_cmd_set_input_stream_address_ranges(a, sdev->dev_id)) {
+                dev_err(a->dev, "aaudio: Failed to set input stream address ranges\n");
+            }
+        }
+    }
+
+    return 0;
+}
+
+static void aaudio_init_bs_stream(struct aaudio_device *a, struct aaudio_stream *strm,
+                                  struct aaudio_buffer_struct_stream *bs_strm)
+{
+    size_t i;
+    strm->buffer_cnt = bs_strm->num_buffers;
+    if (bs_strm->num_buffers > AAUDIO_DEIVCE_MAX_BUFFER_COUNT) {
+        dev_warn(a->dev, "BufferStruct buffer count %u exceeds driver limit of %u\n", bs_strm->num_buffers,
+                AAUDIO_DEIVCE_MAX_BUFFER_COUNT);
+        strm->buffer_cnt = AAUDIO_DEIVCE_MAX_BUFFER_COUNT;
+    }
+    if (!strm->buffer_cnt)
+        return;
+    strm->buffers = kmalloc_array(strm->buffer_cnt, sizeof(struct aaudio_dma_buf), GFP_KERNEL);
+    if (!strm->buffers) {
+        dev_err(a->dev, "Buffer list allocation failed\n");
+        return;
+    }
+    for (i = 0; i < strm->buffer_cnt; i++) {
+        strm->buffers[i].dma_addr = a->reg_mem_bs_dma + (dma_addr_t) bs_strm->buffers[i].address;
+        strm->buffers[i].ptr = a->reg_mem_bs + bs_strm->buffers[i].address;
+        strm->buffers[i].size = bs_strm->buffers[i].size;
+    }
+
+    if (strm->buffer_cnt == 1) {
+        strm->alsa_hw_desc = kmalloc(sizeof(struct snd_pcm_hardware), GFP_KERNEL);
+        if (aaudio_create_hw_info(&strm->desc, strm->alsa_hw_desc, strm->buffers[0].size)) {
+            kfree(strm->alsa_hw_desc);
+            strm->alsa_hw_desc = NULL;
+        }
+    }
+}
+
+static void aaudio_init_bs_stream_host(struct aaudio_device *a, struct aaudio_stream *strm,
+        struct aaudio_buffer_struct_stream *bs_strm)
+{
+    size_t size;
+    dma_addr_t dma_addr;
+    void *dma_ptr;
+    size = strm->desc.bytes_per_packet * 16640;
+    dma_ptr = dma_alloc_coherent(&a->pci->dev, size, &dma_addr, GFP_KERNEL);
+    if (!dma_ptr) {
+        dev_err(a->dev, "dma_alloc_coherent failed\n");
+        return;
+    }
+    bs_strm->buffers[0].address = dma_addr;
+    bs_strm->buffers[0].size = size;
+    bs_strm->num_buffers = 1;
+
+    memset(dma_ptr, 0, size);
+
+    strm->buffer_cnt = 1;
+    strm->buffers = kmalloc_array(strm->buffer_cnt, sizeof(struct aaudio_dma_buf), GFP_KERNEL);
+    if (!strm->buffers) {
+        dev_err(a->dev, "Buffer list allocation failed\n");
+        return;
+    }
+    strm->buffers[0].dma_addr = dma_addr;
+    strm->buffers[0].ptr = dma_ptr;
+    strm->buffers[0].size = size;
+
+    strm->alsa_hw_desc = kmalloc(sizeof(struct snd_pcm_hardware), GFP_KERNEL);
+    if (aaudio_create_hw_info(&strm->desc, strm->alsa_hw_desc, strm->buffers[0].size)) {
+        kfree(strm->alsa_hw_desc);
+        strm->alsa_hw_desc = NULL;
+    }
+}
+
+static void aaudio_handle_prop_change(struct aaudio_device *a, struct aaudio_msg *msg);
+
+void aaudio_handle_notification(struct aaudio_device *a, struct aaudio_msg *msg)
+{
+    struct aaudio_send_ctx sctx;
+    struct aaudio_msg_base base;
+    if (aaudio_msg_read_base(msg, &base))
+        return;
+    switch (base.msg) {
+        case AAUDIO_MSG_NOTIFICATION_BOOT:
+            dev_info(a->dev, "Received boot notification from remote\n");
+
+            /* Resend the alive notify */
+            if (aaudio_send(a, &sctx, 500,
+                    aaudio_msg_write_alive_notification, 1, 3)) {
+                pr_err("Sending alive notification failed\n");
+            }
+            break;
+        case AAUDIO_MSG_NOTIFICATION_ALIVE:
+            dev_info(a->dev, "Received alive notification from remote\n");
+            complete_all(&a->remote_alive);
+            break;
+        case AAUDIO_MSG_PROPERTY_CHANGED:
+            aaudio_handle_prop_change(a, msg);
+            break;
+        default:
+            dev_info(a->dev, "Unhandled notification %i", base.msg);
+            break;
+    }
+}
+
+struct aaudio_prop_change_work_struct {
+    struct work_struct ws;
+    struct aaudio_device *a;
+    aaudio_device_id_t dev;
+    aaudio_object_id_t obj;
+    struct aaudio_prop_addr prop;
+};
+
+static void aaudio_handle_jack_connection_change(struct aaudio_subdevice *sdev)
+{
+    u32 plugged;
+    if (!sdev->jack)
+        return;
+    /* NOTE: Apple made the plug status scoped to the input and output streams. This makes no sense for us, so I just
+     * always pick the OUTPUT status. */
+    if (aaudio_cmd_get_primitive_property(sdev->a, sdev->dev_id, sdev->dev_id,
+            AAUDIO_PROP(AAUDIO_PROP_SCOPE_OUTPUT, AAUDIO_PROP_JACK_PLUGGED, 0), NULL, 0, &plugged, sizeof(plugged))) {
+        dev_err(sdev->a->dev, "Failed to get jack enable status\n");
+        return;
+    }
+    dev_dbg(sdev->a->dev, "Jack is now %s\n", plugged ? "plugged" : "unplugged");
+    snd_jack_report(sdev->jack, plugged ? sdev->jack->type : 0);
+}
+
+void aaudio_handle_prop_change_work(struct work_struct *ws)
+{
+    struct aaudio_prop_change_work_struct *work = container_of(ws, struct aaudio_prop_change_work_struct, ws);
+    struct aaudio_subdevice *sdev;
+
+    sdev = aaudio_find_dev_by_dev_id(work->a, work->dev);
+    if (!sdev) {
+        dev_err(work->a->dev, "Property notification change: device not found\n");
+        goto done;
+    }
+    dev_dbg(work->a->dev, "Property changed for device: %s\n", sdev->uid);
+
+    if (work->prop.scope == AAUDIO_PROP_SCOPE_OUTPUT && work->prop.selector == AAUDIO_PROP_JACK_PLUGGED) {
+        aaudio_handle_jack_connection_change(sdev);
+    }
+
+done:
+    kfree(work);
+}
+
+void aaudio_handle_prop_change(struct aaudio_device *a, struct aaudio_msg *msg)
+{
+    /* NOTE: This is a scheduled work because this callback will generally need to query device information and this
+     * is not possible when we are in the reply parsing code's context. */
+    struct aaudio_prop_change_work_struct *work;
+    work = kmalloc(sizeof(struct aaudio_prop_change_work_struct), GFP_KERNEL);
+    work->a = a;
+    INIT_WORK(&work->ws, aaudio_handle_prop_change_work);
+    aaudio_msg_read_property_changed(msg, &work->dev, &work->obj, &work->prop);
+    schedule_work(&work->ws);
+}
+
+#define aaudio_send_cmd_response(a, sctx, msg, fn, ...) \
+    if (aaudio_send_with_tag(a, sctx, ((struct aaudio_msg_header *) msg->data)->tag, 500, fn, ##__VA_ARGS__)) \
+        pr_err("aaudio: Failed to reply to a command\n");
+
+void aaudio_handle_cmd_timestamp(struct aaudio_device *a, struct aaudio_msg *msg)
+{
+    ktime_t time_os = ktime_get_boottime();
+    struct aaudio_send_ctx sctx;
+    struct aaudio_subdevice *sdev;
+    u64 devid, timestamp, update_seed;
+    aaudio_msg_read_update_timestamp(msg, &devid, &timestamp, &update_seed);
+    dev_dbg(a->dev, "Received timestamp update for dev=%llx ts=%llx seed=%llx\n", devid, timestamp, update_seed);
+
+    sdev = aaudio_find_dev_by_dev_id(a, devid);
+    aaudio_handle_timestamp(sdev, time_os, timestamp);
+
+    aaudio_send_cmd_response(a, &sctx, msg,
+            aaudio_msg_write_update_timestamp_response);
+}
+
+void aaudio_handle_command(struct aaudio_device *a, struct aaudio_msg *msg)
+{
+    struct aaudio_msg_base base;
+    if (aaudio_msg_read_base(msg, &base))
+        return;
+    switch (base.msg) {
+        case AAUDIO_MSG_UPDATE_TIMESTAMP:
+            aaudio_handle_cmd_timestamp(a, msg);
+            break;
+        default:
+            dev_info(a->dev, "Unhandled device command %i", base.msg);
+            break;
+    }
+}
+
+static struct pci_device_id aaudio_ids[  ] = {
+        { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x1803) },
+        { 0, },
+};
+
+struct dev_pm_ops aaudio_pci_driver_pm = {
+        .suspend = aaudio_suspend,
+        .resume = aaudio_resume
+};
+struct pci_driver aaudio_pci_driver = {
+        .name = "aaudio",
+        .id_table = aaudio_ids,
+        .probe = aaudio_probe,
+        .remove = aaudio_remove,
+        .driver = {
+                .pm = &aaudio_pci_driver_pm
+        }
+};
+
+
+int aaudio_module_init(void)
+{
+    int result;
+    if ((result = alloc_chrdev_region(&aaudio_chrdev, 0, 1, "aaudio")))
+        goto fail_chrdev;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6,4,0)
+    aaudio_class = class_create(THIS_MODULE, "aaudio");
+#else
+    aaudio_class = class_create("aaudio");
+#endif
+    if (IS_ERR(aaudio_class)) {
+        result = PTR_ERR(aaudio_class);
+        goto fail_class;
+    }
+    
+    result = pci_register_driver(&aaudio_pci_driver);
+    if (result)
+        goto fail_drv;
+    return 0;
+
+fail_drv:
+    pci_unregister_driver(&aaudio_pci_driver);
+fail_class:
+    class_destroy(aaudio_class);
+fail_chrdev:
+    unregister_chrdev_region(aaudio_chrdev, 1);
+    if (!result)
+        result = -EINVAL;
+    return result;
+}
+
+void aaudio_module_exit(void)
+{
+    pci_unregister_driver(&aaudio_pci_driver);
+    class_destroy(aaudio_class);
+    unregister_chrdev_region(aaudio_chrdev, 1);
+}
+
+struct aaudio_alsa_pcm_id_mapping aaudio_alsa_id_mappings[] = {
+        {"Speaker", 0},
+        {"Digital Mic", 1},
+        {"Codec Output", 2},
+        {"Codec Input", 3},
+        {"Bridge Loopback", 4},
+        {}
+};
+
+module_param_named(index, aaudio_alsa_index, int, 0444);
+MODULE_PARM_DESC(index, "Index value for Apple Internal Audio soundcard.");
+module_param_named(id, aaudio_alsa_id, charp, 0444);
+MODULE_PARM_DESC(id, "ID string for Apple Internal Audio soundcard.");
diff --git a/drivers/staging/apple-bce/audio/audio.h b/drivers/staging/apple-bce/audio/audio.h
new file mode 100644
index 000000000000..004bc1e22ea4
--- /dev/null
+++ b/drivers/staging/apple-bce/audio/audio.h
@@ -0,0 +1,125 @@
+#ifndef AAUDIO_H
+#define AAUDIO_H
+
+#include <linux/types.h>
+#include <sound/pcm.h>
+#include "../apple_bce.h"
+#include "protocol_bce.h"
+#include "description.h"
+
+#define AAUDIO_SIG 0x19870423
+
+#define AAUDIO_DEVICE_MAX_UID_LEN 128
+#define AAUDIO_DEIVCE_MAX_INPUT_STREAMS 1
+#define AAUDIO_DEIVCE_MAX_OUTPUT_STREAMS 1
+#define AAUDIO_DEIVCE_MAX_BUFFER_COUNT 1
+
+#define AAUDIO_BUFFER_ID_NONE 0xffu
+
+struct snd_card;
+struct snd_pcm;
+struct snd_pcm_hardware;
+struct snd_jack;
+
+struct __attribute__((packed)) __attribute__((aligned(4))) aaudio_buffer_struct_buffer {
+    size_t address;
+    size_t size;
+    size_t pad[4];
+};
+struct aaudio_buffer_struct_stream {
+    u8 num_buffers;
+    struct aaudio_buffer_struct_buffer buffers[100];
+    char filler[32];
+};
+struct aaudio_buffer_struct_device {
+    char name[128];
+    u8 num_input_streams;
+    u8 num_output_streams;
+    struct aaudio_buffer_struct_stream input_streams[5];
+    struct aaudio_buffer_struct_stream output_streams[5];
+    char filler[128];
+};
+struct aaudio_buffer_struct {
+    u32 version;
+    u32 signature;
+    u32 flags;
+    u8 num_devices;
+    struct aaudio_buffer_struct_device devices[20];
+};
+
+struct aaudio_device;
+struct aaudio_dma_buf {
+    dma_addr_t dma_addr;
+    void *ptr;
+    size_t size;
+};
+struct aaudio_stream {
+    aaudio_object_id_t id;
+    size_t buffer_cnt;
+    struct aaudio_dma_buf *buffers;
+
+    struct aaudio_apple_description desc;
+    struct snd_pcm_hardware *alsa_hw_desc;
+    u32 latency;
+
+    bool waiting_for_first_ts;
+
+    ktime_t remote_timestamp;
+    snd_pcm_sframes_t frame_min;
+    int started;
+};
+struct aaudio_subdevice {
+    struct aaudio_device *a;
+    struct list_head list;
+    aaudio_device_id_t dev_id;
+    u32 in_latency, out_latency;
+    u8 buf_id;
+    int alsa_id;
+    char uid[AAUDIO_DEVICE_MAX_UID_LEN + 1];
+    size_t in_stream_cnt;
+    struct aaudio_stream in_streams[AAUDIO_DEIVCE_MAX_INPUT_STREAMS];
+    size_t out_stream_cnt;
+    struct aaudio_stream out_streams[AAUDIO_DEIVCE_MAX_OUTPUT_STREAMS];
+    bool is_pcm;
+    struct snd_pcm *pcm;
+    struct snd_jack *jack;
+};
+struct aaudio_alsa_pcm_id_mapping {
+    const char *name;
+    int alsa_id;
+};
+
+struct aaudio_device {
+    struct pci_dev *pci;
+    dev_t devt;
+    struct device *dev;
+    void __iomem *reg_mem_bs;
+    dma_addr_t reg_mem_bs_dma;
+    void __iomem *reg_mem_cfg;
+
+    u32 __iomem *reg_mem_gpr;
+
+    struct aaudio_buffer_struct *bs;
+
+    struct apple_bce_device *bce;
+    struct aaudio_bce bcem;
+
+    struct snd_card *card;
+
+    struct list_head subdevice_list;
+    int next_alsa_id;
+
+    struct completion remote_alive;
+};
+
+void aaudio_handle_notification(struct aaudio_device *a, struct aaudio_msg *msg);
+void aaudio_handle_prop_change_work(struct work_struct *ws);
+void aaudio_handle_cmd_timestamp(struct aaudio_device *a, struct aaudio_msg *msg);
+void aaudio_handle_command(struct aaudio_device *a, struct aaudio_msg *msg);
+
+int aaudio_module_init(void);
+void aaudio_module_exit(void);
+
+extern struct aaudio_alsa_pcm_id_mapping aaudio_alsa_id_mappings[];
+
+#endif //AAUDIO_H
diff --git a/drivers/staging/apple-bce/audio/description.h b/drivers/staging/apple-bce/audio/description.h
new file mode 100644
index 000000000000..dfef3ab68f27
--- /dev/null
+++ b/drivers/staging/apple-bce/audio/description.h
@@ -0,0 +1,42 @@
+#ifndef AAUDIO_DESCRIPTION_H
+#define AAUDIO_DESCRIPTION_H
+
+#include <linux/types.h>
+
+struct aaudio_apple_description {
+    u64 sample_rate_double;
+    u32 format_id;
+    u32 format_flags;
+    u32 bytes_per_packet;
+    u32 frames_per_packet;
+    u32 bytes_per_frame;
+    u32 channels_per_frame;
+    u32 bits_per_channel;
+    u32 reserved;
+};
+
+enum {
+    AAUDIO_FORMAT_LPCM = 0x6c70636d  // 'lpcm'
+};
+
+enum {
+    AAUDIO_FORMAT_FLAG_FLOAT = 1,
+    AAUDIO_FORMAT_FLAG_BIG_ENDIAN = 2,
+    AAUDIO_FORMAT_FLAG_SIGNED = 4,
+    AAUDIO_FORMAT_FLAG_PACKED = 8,
+    AAUDIO_FORMAT_FLAG_ALIGNED_HIGH = 16,
+    AAUDIO_FORMAT_FLAG_NON_INTERLEAVED = 32,
+    AAUDIO_FORMAT_FLAG_NON_MIXABLE = 64
+};
+
+static inline u64 aaudio_double_to_u64(u64 d)
+{
+    u8 sign = (u8) ((d >> 63) & 1);
+    s32 exp = (s32) ((d >> 52) & 0x7ff) - 1023;
+    u64 fr = d & ((1LL << 52) - 1);
+    if (sign || exp < 0)
+        return 0;
+    return (u64) ((1LL << exp) + (fr >> (52 - exp)));
+}
+
+#endif //AAUDIO_DESCRIPTION_H
diff --git a/drivers/staging/apple-bce/audio/pcm.c b/drivers/staging/apple-bce/audio/pcm.c
new file mode 100644
index 000000000000..1026e10a9ac5
--- /dev/null
+++ b/drivers/staging/apple-bce/audio/pcm.c
@@ -0,0 +1,308 @@
+#include "pcm.h"
+#include "audio.h"
+
+static u64 aaudio_get_alsa_fmtbit(struct aaudio_apple_description *desc)
+{
+    if (desc->format_flags & AAUDIO_FORMAT_FLAG_FLOAT) {
+        if (desc->bits_per_channel == 32) {
+            if (desc->format_flags & AAUDIO_FORMAT_FLAG_BIG_ENDIAN)
+                return SNDRV_PCM_FMTBIT_FLOAT_BE;
+            else
+                return SNDRV_PCM_FMTBIT_FLOAT_LE;
+        } else if (desc->bits_per_channel == 64) {
+            if (desc->format_flags & AAUDIO_FORMAT_FLAG_BIG_ENDIAN)
+                return SNDRV_PCM_FMTBIT_FLOAT64_BE;
+            else
+                return SNDRV_PCM_FMTBIT_FLOAT64_LE;
+        } else {
+            pr_err("aaudio: unsupported bits per channel for float format: %u\n", desc->bits_per_channel);
+            return 0;
+        }
+    }
+#define DEFINE_BPC_OPTION(val, b) \
+    case val: \
+        if (desc->format_flags & AAUDIO_FORMAT_FLAG_BIG_ENDIAN) { \
+            if (desc->format_flags & AAUDIO_FORMAT_FLAG_SIGNED) \
+                return SNDRV_PCM_FMTBIT_S ## b ## BE; \
+            else \
+                return SNDRV_PCM_FMTBIT_U ## b ## BE; \
+        } else { \
+            if (desc->format_flags & AAUDIO_FORMAT_FLAG_SIGNED) \
+                return SNDRV_PCM_FMTBIT_S ## b ## LE; \
+            else \
+                return SNDRV_PCM_FMTBIT_U ## b ## LE; \
+        }
+    if (desc->format_flags & AAUDIO_FORMAT_FLAG_PACKED) {
+        switch (desc->bits_per_channel) {
+            case 8:
+            case 16:
+            case 32:
+                break;
+            DEFINE_BPC_OPTION(24, 24_3)
+            default:
+                pr_err("aaudio: unsupported bits per channel for packed format: %u\n", desc->bits_per_channel);
+                return 0;
+        }
+    }
+    if (desc->format_flags & AAUDIO_FORMAT_FLAG_ALIGNED_HIGH) {
+        switch (desc->bits_per_channel) {
+            DEFINE_BPC_OPTION(24, 32_)
+            default:
+                pr_err("aaudio: unsupported bits per channel for high-aligned format: %u\n", desc->bits_per_channel);
+                return 0;
+        }
+    }
+    switch (desc->bits_per_channel) {
+        case 8:
+            if (desc->format_flags & AAUDIO_FORMAT_FLAG_SIGNED)
+                return SNDRV_PCM_FMTBIT_S8;
+            else
+                return SNDRV_PCM_FMTBIT_U8;
+        DEFINE_BPC_OPTION(16, 16_)
+        DEFINE_BPC_OPTION(24, 24_)
+        DEFINE_BPC_OPTION(32, 32_)
+        default:
+            pr_err("aaudio: unsupported bits per channel: %u\n", desc->bits_per_channel);
+            return 0;
+    }
+}
+int aaudio_create_hw_info(struct aaudio_apple_description *desc, struct snd_pcm_hardware *alsa_hw,
+        size_t buf_size)
+{
+    uint rate;
+    alsa_hw->info = (SNDRV_PCM_INFO_MMAP |
+                     SNDRV_PCM_INFO_BLOCK_TRANSFER |
+                     SNDRV_PCM_INFO_MMAP_VALID |
+                     SNDRV_PCM_INFO_DOUBLE);
+    if (desc->format_flags & AAUDIO_FORMAT_FLAG_NON_MIXABLE)
+        pr_warn("aaudio: unsupported hw flag: NON_MIXABLE\n");
+    if (!(desc->format_flags & AAUDIO_FORMAT_FLAG_NON_INTERLEAVED))
+        alsa_hw->info |= SNDRV_PCM_INFO_INTERLEAVED;
+    alsa_hw->formats = aaudio_get_alsa_fmtbit(desc);
+    if (!alsa_hw->formats)
+        return -EINVAL;
+    rate = (uint) aaudio_double_to_u64(desc->sample_rate_double);
+    alsa_hw->rates = snd_pcm_rate_to_rate_bit(rate);
+    alsa_hw->rate_min = rate;
+    alsa_hw->rate_max = rate;
+    alsa_hw->channels_min = desc->channels_per_frame;
+    alsa_hw->channels_max = desc->channels_per_frame;
+    alsa_hw->buffer_bytes_max = buf_size;
+    alsa_hw->period_bytes_min = desc->bytes_per_packet;
+    alsa_hw->period_bytes_max = desc->bytes_per_packet;
+    alsa_hw->periods_min = (uint) (buf_size / desc->bytes_per_packet);
+    alsa_hw->periods_max = (uint) (buf_size / desc->bytes_per_packet);
+    pr_debug("aaudio_create_hw_info: format = %llu, rate = %u/%u. channels = %u, periods = %u, period size = %lu\n",
+            alsa_hw->formats, alsa_hw->rate_min, alsa_hw->rates, alsa_hw->channels_min, alsa_hw->periods_min,
+            alsa_hw->period_bytes_min);
+    return 0;
+}
+
+static struct aaudio_stream *aaudio_pcm_stream(struct snd_pcm_substream *substream)
+{
+    struct aaudio_subdevice *sdev = snd_pcm_substream_chip(substream);
+    if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK)
+        return &sdev->out_streams[substream->number];
+    else
+        return &sdev->in_streams[substream->number];
+}
+
+static int aaudio_pcm_open(struct snd_pcm_substream *substream)
+{
+    pr_debug("aaudio_pcm_open\n");
+    substream->runtime->hw = *aaudio_pcm_stream(substream)->alsa_hw_desc;
+
+    return 0;
+}
+
+static int aaudio_pcm_close(struct snd_pcm_substream *substream)
+{
+    pr_debug("aaudio_pcm_close\n");
+    return 0;
+}
+
+static int aaudio_pcm_prepare(struct snd_pcm_substream *substream)
+{
+    return 0;
+}
+
+static int aaudio_pcm_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *hw_params)
+{
+    struct aaudio_stream *astream = aaudio_pcm_stream(substream);
+    pr_debug("aaudio_pcm_hw_params\n");
+
+    if (!astream->buffer_cnt || !astream->buffers)
+        return -EINVAL;
+
+    substream->runtime->dma_area = astream->buffers[0].ptr;
+    substream->runtime->dma_addr = astream->buffers[0].dma_addr;
+    substream->runtime->dma_bytes = astream->buffers[0].size;
+    return 0;
+}
+
+static int aaudio_pcm_hw_free(struct snd_pcm_substream *substream)
+{
+    pr_debug("aaudio_pcm_hw_free\n");
+    return 0;
+}
+
+static void aaudio_pcm_start(struct snd_pcm_substream *substream)
+{
+    struct aaudio_subdevice *sdev = snd_pcm_substream_chip(substream);
+    struct aaudio_stream *stream = aaudio_pcm_stream(substream);
+    void *buf;
+    size_t s;
+    ktime_t time_start, time_end;
+    bool back_buffer;
+    time_start = ktime_get();
+
+    back_buffer = (substream->stream == SNDRV_PCM_STREAM_PLAYBACK);
+
+    if (back_buffer) {
+        s = frames_to_bytes(substream->runtime, substream->runtime->control->appl_ptr);
+        buf = kmalloc(s, GFP_KERNEL);
+        memcpy_fromio(buf, substream->runtime->dma_area, s);
+        time_end = ktime_get();
+        pr_debug("aaudio: Backed up the buffer in %lluns [%li]\n", ktime_to_ns(time_end - time_start),
+                substream->runtime->control->appl_ptr);
+    }
+
+    stream->waiting_for_first_ts = true;
+    stream->frame_min = stream->latency;
+
+    aaudio_cmd_start_io(sdev->a, sdev->dev_id);
+    if (back_buffer)
+        memcpy_toio(substream->runtime->dma_area, buf, s);
+
+    time_end = ktime_get();
+    pr_debug("aaudio: Started the audio device in %lluns\n", ktime_to_ns(time_end - time_start));
+}
+
+static int aaudio_pcm_trigger(struct snd_pcm_substream *substream, int cmd)
+{
+    struct aaudio_subdevice *sdev = snd_pcm_substream_chip(substream);
+    struct aaudio_stream *stream = aaudio_pcm_stream(substream);
+    pr_debug("aaudio_pcm_trigger %x\n", cmd);
+
+    /* We only supports triggers on the #0 buffer */
+    if (substream->number != 0)
+        return 0;
+    switch (cmd) {
+        case SNDRV_PCM_TRIGGER_START:
+            aaudio_pcm_start(substream);
+            stream->started = 1;
+            break;
+        case SNDRV_PCM_TRIGGER_STOP:
+            aaudio_cmd_stop_io(sdev->a, sdev->dev_id);
+            stream->started = 0;
+            break;
+        default:
+            return -EINVAL;
+    }
+    return 0;
+}
+
+static snd_pcm_uframes_t aaudio_pcm_pointer(struct snd_pcm_substream *substream)
+{
+    struct aaudio_stream *stream = aaudio_pcm_stream(substream);
+    ktime_t time_from_start;
+    snd_pcm_sframes_t frames;
+    snd_pcm_sframes_t buffer_time_length;
+
+    if (!stream->started || stream->waiting_for_first_ts) {
+        pr_warn("aaudio_pcm_pointer while not started\n");
+        return 0;
+    }
+
+    /* Approximate the pointer based on the last received timestamp */
+    time_from_start = ktime_get_boottime() - stream->remote_timestamp;
+    buffer_time_length = NSEC_PER_SEC * substream->runtime->buffer_size / substream->runtime->rate;
+    frames = (ktime_to_ns(time_from_start) % buffer_time_length) * substream->runtime->buffer_size / buffer_time_length;
+    if (ktime_to_ns(time_from_start) < buffer_time_length) {
+        if (frames < stream->frame_min)
+            frames = stream->frame_min;
+        else
+            stream->frame_min = 0;
+    } else {
+        if (ktime_to_ns(time_from_start) < 2 * buffer_time_length)
+            stream->frame_min = frames;
+        else
+            stream->frame_min = 0; /* Heavy desync */
+    }
+    frames -= stream->latency;
+    if (frames < 0)
+        frames += ((-frames - 1) / substream->runtime->buffer_size + 1) * substream->runtime->buffer_size;
+    return (snd_pcm_uframes_t) frames;
+}
+
+static struct snd_pcm_ops aaudio_pcm_ops = {
+        .open =        aaudio_pcm_open,
+        .close =       aaudio_pcm_close,
+        .ioctl =       snd_pcm_lib_ioctl,
+        .hw_params =   aaudio_pcm_hw_params,
+        .hw_free =     aaudio_pcm_hw_free,
+        .prepare =     aaudio_pcm_prepare,
+        .trigger =     aaudio_pcm_trigger,
+        .pointer =     aaudio_pcm_pointer,
+        .mmap    =     snd_pcm_lib_mmap_iomem
+};
+
+int aaudio_create_pcm(struct aaudio_subdevice *sdev)
+{
+    struct snd_pcm *pcm;
+    struct aaudio_alsa_pcm_id_mapping *id_mapping;
+    int err;
+
+    if (!sdev->is_pcm || (sdev->in_stream_cnt == 0 && sdev->out_stream_cnt == 0)) {
+        return -EINVAL;
+    }
+
+    for (id_mapping = aaudio_alsa_id_mappings; id_mapping->name; id_mapping++) {
+        if (!strcmp(sdev->uid, id_mapping->name)) {
+            sdev->alsa_id = id_mapping->alsa_id;
+            break;
+        }
+    }
+    if (!id_mapping->name)
+        sdev->alsa_id = sdev->a->next_alsa_id++;
+    err = snd_pcm_new(sdev->a->card, sdev->uid, sdev->alsa_id,
+            (int) sdev->out_stream_cnt, (int) sdev->in_stream_cnt, &pcm);
+    if (err < 0)
+        return err;
+    pcm->private_data = sdev;
+    pcm->nonatomic = 1;
+    sdev->pcm = pcm;
+    strcpy(pcm->name, sdev->uid);
+    snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &aaudio_pcm_ops);
+    snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE, &aaudio_pcm_ops);
+    return 0;
+}
+
+static void aaudio_handle_stream_timestamp(struct snd_pcm_substream *substream, ktime_t timestamp)
+{
+    unsigned long flags;
+    struct aaudio_stream *stream;
+
+    stream = aaudio_pcm_stream(substream);
+    snd_pcm_stream_lock_irqsave(substream, flags);
+    stream->remote_timestamp = timestamp;
+    if (stream->waiting_for_first_ts) {
+        stream->waiting_for_first_ts = false;
+        snd_pcm_stream_unlock_irqrestore(substream, flags);
+        return;
+    }
+    snd_pcm_stream_unlock_irqrestore(substream, flags);
+    snd_pcm_period_elapsed(substream);
+}
+
+void aaudio_handle_timestamp(struct aaudio_subdevice *sdev, ktime_t os_timestamp, u64 dev_timestamp)
+{
+    struct snd_pcm_substream *substream;
+
+    substream = sdev->pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream;
+    if (substream)
+        aaudio_handle_stream_timestamp(substream, dev_timestamp);
+    substream = sdev->pcm->streams[SNDRV_PCM_STREAM_CAPTURE].substream;
+    if (substream)
+        aaudio_handle_stream_timestamp(substream, os_timestamp);
+}
diff --git a/drivers/staging/apple-bce/audio/pcm.h b/drivers/staging/apple-bce/audio/pcm.h
new file mode 100644
index 000000000000..ea5f35fbe408
--- /dev/null
+++ b/drivers/staging/apple-bce/audio/pcm.h
@@ -0,0 +1,16 @@
+#ifndef AAUDIO_PCM_H
+#define AAUDIO_PCM_H
+
+#include <linux/types.h>
+#include <linux/ktime.h>
+
+struct aaudio_subdevice;
+struct aaudio_apple_description;
+struct snd_pcm_hardware;
+
+int aaudio_create_hw_info(struct aaudio_apple_description *desc, struct snd_pcm_hardware *alsa_hw, size_t buf_size);
+int aaudio_create_pcm(struct aaudio_subdevice *sdev);
+
+void aaudio_handle_timestamp(struct aaudio_subdevice *sdev, ktime_t os_timestamp, u64 dev_timestamp);
+
+#endif //AAUDIO_PCM_H
diff --git a/drivers/staging/apple-bce/audio/protocol.c b/drivers/staging/apple-bce/audio/protocol.c
new file mode 100644
index 000000000000..2314813aeead
--- /dev/null
+++ b/drivers/staging/apple-bce/audio/protocol.c
@@ -0,0 +1,347 @@
+#include "protocol.h"
+#include "protocol_bce.h"
+#include "audio.h"
+
+int aaudio_msg_read_base(struct aaudio_msg *msg, struct aaudio_msg_base *base)
+{
+    if (msg->size < sizeof(struct aaudio_msg_header) + sizeof(struct aaudio_msg_base) * 2)
+        return -EINVAL;
+    *base = *((struct aaudio_msg_base *) ((struct aaudio_msg_header *) msg->data + 1));
+    return 0;
+}
+
+#define READ_START(type) \
+    size_t offset = sizeof(struct aaudio_msg_header) + sizeof(struct aaudio_msg_base); (void)offset; \
+    if (((struct aaudio_msg_base *) ((struct aaudio_msg_header *) msg->data + 1))->msg != type) \
+        return -EINVAL;
+#define READ_DEVID_VAR(devid) *devid = ((struct aaudio_msg_header *) msg->data)->device_id
+#define READ_VAL(type) ({ offset += sizeof(type); *((type *) ((u8 *) msg->data + offset - sizeof(type))); })
+#define READ_VAR(type, var) *var = READ_VAL(type)
+
+int aaudio_msg_read_start_io_response(struct aaudio_msg *msg)
+{
+    READ_START(AAUDIO_MSG_START_IO_RESPONSE);
+    return 0;
+}
+
+int aaudio_msg_read_stop_io_response(struct aaudio_msg *msg)
+{
+    READ_START(AAUDIO_MSG_STOP_IO_RESPONSE);
+    return 0;
+}
+
+int aaudio_msg_read_update_timestamp(struct aaudio_msg *msg, aaudio_device_id_t *devid,
+        u64 *timestamp, u64 *update_seed)
+{
+    READ_START(AAUDIO_MSG_UPDATE_TIMESTAMP);
+    READ_DEVID_VAR(devid);
+    READ_VAR(u64, timestamp);
+    READ_VAR(u64, update_seed);
+    return 0;
+}
+
+int aaudio_msg_read_get_property_response(struct aaudio_msg *msg, aaudio_object_id_t *obj,
+        struct aaudio_prop_addr *prop, void **data, u64 *data_size)
+{
+    READ_START(AAUDIO_MSG_GET_PROPERTY_RESPONSE);
+    READ_VAR(aaudio_object_id_t, obj);
+    READ_VAR(u32, &prop->element);
+    READ_VAR(u32, &prop->scope);
+    READ_VAR(u32, &prop->selector);
+    READ_VAR(u64, data_size);
+    *data = ((u8 *) msg->data + offset);
+    /* offset += data_size; */
+    return 0;
+}
+
+int aaudio_msg_read_set_property_response(struct aaudio_msg *msg, aaudio_object_id_t *obj)
+{
+    READ_START(AAUDIO_MSG_SET_PROPERTY_RESPONSE);
+    READ_VAR(aaudio_object_id_t, obj);
+    return 0;
+}
+
+int aaudio_msg_read_property_listener_response(struct aaudio_msg *msg, aaudio_object_id_t *obj,
+        struct aaudio_prop_addr *prop)
+{
+    READ_START(AAUDIO_MSG_PROPERTY_LISTENER_RESPONSE);
+    READ_VAR(aaudio_object_id_t, obj);
+    READ_VAR(u32, &prop->element);
+    READ_VAR(u32, &prop->scope);
+    READ_VAR(u32, &prop->selector);
+    return 0;
+}
+
+int aaudio_msg_read_property_changed(struct aaudio_msg *msg, aaudio_device_id_t *devid, aaudio_object_id_t *obj,
+        struct aaudio_prop_addr *prop)
+{
+    READ_START(AAUDIO_MSG_PROPERTY_CHANGED);
+    READ_DEVID_VAR(devid);
+    READ_VAR(aaudio_object_id_t, obj);
+    READ_VAR(u32, &prop->element);
+    READ_VAR(u32, &prop->scope);
+    READ_VAR(u32, &prop->selector);
+    return 0;
+}
+
+int aaudio_msg_read_set_input_stream_address_ranges_response(struct aaudio_msg *msg)
+{
+    READ_START(AAUDIO_MSG_SET_INPUT_STREAM_ADDRESS_RANGES_RESPONSE);
+    return 0;
+}
+
+int aaudio_msg_read_get_input_stream_list_response(struct aaudio_msg *msg, aaudio_object_id_t **str_l, u64 *str_cnt)
+{
+    READ_START(AAUDIO_MSG_GET_INPUT_STREAM_LIST_RESPONSE);
+    READ_VAR(u64, str_cnt);
+    *str_l = (aaudio_device_id_t *) ((u8 *) msg->data + offset);
+    /* offset += str_cnt * sizeof(aaudio_object_id_t); */
+    return 0;
+}
+
+int aaudio_msg_read_get_output_stream_list_response(struct aaudio_msg *msg, aaudio_object_id_t **str_l, u64 *str_cnt)
+{
+    READ_START(AAUDIO_MSG_GET_OUTPUT_STREAM_LIST_RESPONSE);
+    READ_VAR(u64, str_cnt);
+    *str_l = (aaudio_device_id_t *) ((u8 *) msg->data + offset);
+    /* offset += str_cnt * sizeof(aaudio_object_id_t); */
+    return 0;
+}
+
+int aaudio_msg_read_set_remote_access_response(struct aaudio_msg *msg)
+{
+    READ_START(AAUDIO_MSG_SET_REMOTE_ACCESS_RESPONSE);
+    return 0;
+}
+
+int aaudio_msg_read_get_device_list_response(struct aaudio_msg *msg, aaudio_device_id_t **dev_l, u64 *dev_cnt)
+{
+    READ_START(AAUDIO_MSG_GET_DEVICE_LIST_RESPONSE);
+    READ_VAR(u64, dev_cnt);
+    *dev_l = (aaudio_device_id_t *) ((u8 *) msg->data + offset);
+    /* offset += dev_cnt * sizeof(aaudio_device_id_t); */
+    return 0;
+}
+
+#define WRITE_START_OF_TYPE(typev, devid) \
+    size_t offset = sizeof(struct aaudio_msg_header); (void) offset; \
+    ((struct aaudio_msg_header *) msg->data)->type = (typev); \
+    ((struct aaudio_msg_header *) msg->data)->device_id = (devid);
+#define WRITE_START_COMMAND(devid) WRITE_START_OF_TYPE(AAUDIO_MSG_TYPE_COMMAND, devid)
+#define WRITE_START_RESPONSE() WRITE_START_OF_TYPE(AAUDIO_MSG_TYPE_RESPONSE, 0)
+#define WRITE_START_NOTIFICATION() WRITE_START_OF_TYPE(AAUDIO_MSG_TYPE_NOTIFICATION, 0)
+#define WRITE_VAL(type, value) { *((type *) ((u8 *) msg->data + offset)) = value; offset += sizeof(value); }
+#define WRITE_BIN(value, size) { memcpy((u8 *) msg->data + offset, value, size); offset += size; }
+#define WRITE_BASE(type) WRITE_VAL(u32, type) WRITE_VAL(u32, 0)
+#define WRITE_END() { msg->size = offset; }
+
+void aaudio_msg_write_start_io(struct aaudio_msg *msg, aaudio_device_id_t dev)
+{
+    WRITE_START_COMMAND(dev);
+    WRITE_BASE(AAUDIO_MSG_START_IO);
+    WRITE_END();
+}
+
+void aaudio_msg_write_stop_io(struct aaudio_msg *msg, aaudio_device_id_t dev)
+{
+    WRITE_START_COMMAND(dev);
+    WRITE_BASE(AAUDIO_MSG_STOP_IO);
+    WRITE_END();
+}
+
+void aaudio_msg_write_get_property(struct aaudio_msg *msg, aaudio_device_id_t dev, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop, void *qualifier, u64 qualifier_size)
+{
+    WRITE_START_COMMAND(dev);
+    WRITE_BASE(AAUDIO_MSG_GET_PROPERTY);
+    WRITE_VAL(aaudio_object_id_t, obj);
+    WRITE_VAL(u32, prop.element);
+    WRITE_VAL(u32, prop.scope);
+    WRITE_VAL(u32, prop.selector);
+    WRITE_VAL(u64, qualifier_size);
+    WRITE_BIN(qualifier, qualifier_size);
+    WRITE_END();
+}
+
+void aaudio_msg_write_set_property(struct aaudio_msg *msg, aaudio_device_id_t dev, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop, void *data, u64 data_size, void *qualifier, u64 qualifier_size)
+{
+    WRITE_START_COMMAND(dev);
+    WRITE_BASE(AAUDIO_MSG_SET_PROPERTY);
+    WRITE_VAL(aaudio_object_id_t, obj);
+    WRITE_VAL(u32, prop.element);
+    WRITE_VAL(u32, prop.scope);
+    WRITE_VAL(u32, prop.selector);
+    WRITE_VAL(u64, data_size);
+    WRITE_BIN(data, data_size);
+    WRITE_VAL(u64, qualifier_size);
+    WRITE_BIN(qualifier, qualifier_size);
+    WRITE_END();
+}
+
+void aaudio_msg_write_property_listener(struct aaudio_msg *msg, aaudio_device_id_t dev, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop)
+{
+    WRITE_START_COMMAND(dev);
+    WRITE_BASE(AAUDIO_MSG_PROPERTY_LISTENER);
+    WRITE_VAL(aaudio_object_id_t, obj);
+    WRITE_VAL(u32, prop.element);
+    WRITE_VAL(u32, prop.scope);
+    WRITE_VAL(u32, prop.selector);
+    WRITE_END();
+}
+
+void aaudio_msg_write_set_input_stream_address_ranges(struct aaudio_msg *msg, aaudio_device_id_t devid)
+{
+    WRITE_START_COMMAND(devid);
+    WRITE_BASE(AAUDIO_MSG_SET_INPUT_STREAM_ADDRESS_RANGES);
+    WRITE_END();
+}
+
+void aaudio_msg_write_get_input_stream_list(struct aaudio_msg *msg, aaudio_device_id_t devid)
+{
+    WRITE_START_COMMAND(devid);
+    WRITE_BASE(AAUDIO_MSG_GET_INPUT_STREAM_LIST);
+    WRITE_END();
+}
+
+void aaudio_msg_write_get_output_stream_list(struct aaudio_msg *msg, aaudio_device_id_t devid)
+{
+    WRITE_START_COMMAND(devid);
+    WRITE_BASE(AAUDIO_MSG_GET_OUTPUT_STREAM_LIST);
+    WRITE_END();
+}
+
+void aaudio_msg_write_set_remote_access(struct aaudio_msg *msg, u64 mode)
+{
+    WRITE_START_COMMAND(0);
+    WRITE_BASE(AAUDIO_MSG_SET_REMOTE_ACCESS);
+    WRITE_VAL(u64, mode);
+    WRITE_END();
+}
+
+void aaudio_msg_write_alive_notification(struct aaudio_msg *msg, u32 proto_ver, u32 msg_ver)
+{
+    WRITE_START_NOTIFICATION();
+    WRITE_BASE(AAUDIO_MSG_NOTIFICATION_ALIVE);
+    WRITE_VAL(u32, proto_ver);
+    WRITE_VAL(u32, msg_ver);
+    WRITE_END();
+}
+
+void aaudio_msg_write_update_timestamp_response(struct aaudio_msg *msg)
+{
+    WRITE_START_RESPONSE();
+    WRITE_BASE(AAUDIO_MSG_UPDATE_TIMESTAMP_RESPONSE);
+    WRITE_END();
+}
+
+void aaudio_msg_write_get_device_list(struct aaudio_msg *msg)
+{
+    WRITE_START_COMMAND(0);
+    WRITE_BASE(AAUDIO_MSG_GET_DEVICE_LIST);
+    WRITE_END();
+}
+
+#define CMD_SHARED_VARS_NO_REPLY \
+    int status = 0; \
+    struct aaudio_send_ctx sctx;
+#define CMD_SHARED_VARS \
+    CMD_SHARED_VARS_NO_REPLY \
+    struct aaudio_msg reply = aaudio_reply_alloc(); \
+    struct aaudio_msg *buf = &reply;
+#define CMD_SEND_REQUEST(fn, ...) \
+    if ((status = aaudio_send_cmd_sync(a, &sctx, buf, 500, fn, ##__VA_ARGS__))) \
+        return status;
+#define CMD_DEF_SHARED_AND_SEND(fn, ...) \
+    CMD_SHARED_VARS \
+    CMD_SEND_REQUEST(fn, ##__VA_ARGS__);
+#define CMD_DEF_SHARED_NO_REPLY_AND_SEND(fn, ...) \
+    CMD_SHARED_VARS_NO_REPLY \
+    CMD_SEND_REQUEST(fn, ##__VA_ARGS__);
+#define CMD_HNDL_REPLY_NO_FREE(fn, ...) \
+    status = fn(buf, ##__VA_ARGS__); \
+    return status;
+#define CMD_HNDL_REPLY_AND_FREE(fn, ...) \
+    status = fn(buf, ##__VA_ARGS__); \
+    aaudio_reply_free(&reply); \
+    return status;
+
+int aaudio_cmd_start_io(struct aaudio_device *a, aaudio_device_id_t devid)
+{
+    CMD_DEF_SHARED_AND_SEND(aaudio_msg_write_start_io, devid);
+    CMD_HNDL_REPLY_AND_FREE(aaudio_msg_read_start_io_response);
+}
+int aaudio_cmd_stop_io(struct aaudio_device *a, aaudio_device_id_t devid)
+{
+    CMD_DEF_SHARED_AND_SEND(aaudio_msg_write_stop_io, devid);
+    CMD_HNDL_REPLY_AND_FREE(aaudio_msg_read_stop_io_response);
+}
+int aaudio_cmd_get_property(struct aaudio_device *a, struct aaudio_msg *buf,
+        aaudio_device_id_t devid, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop, void *qualifier, u64 qualifier_size, void **data, u64 *data_size)
+{
+    CMD_DEF_SHARED_NO_REPLY_AND_SEND(aaudio_msg_write_get_property, devid, obj, prop, qualifier, qualifier_size);
+    CMD_HNDL_REPLY_NO_FREE(aaudio_msg_read_get_property_response, &obj, &prop, data, data_size);
+}
+int aaudio_cmd_get_primitive_property(struct aaudio_device *a,
+        aaudio_device_id_t devid, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop, void *qualifier, u64 qualifier_size, void *data, u64 data_size)
+{
+    int status;
+    struct aaudio_msg reply = aaudio_reply_alloc();
+    void *r_data;
+    u64 r_data_size;
+    if ((status = aaudio_cmd_get_property(a, &reply, devid, obj, prop, qualifier, qualifier_size,
+            &r_data, &r_data_size)))
+        goto finish;
+    if (r_data_size != data_size) {
+        status = -EINVAL;
+        goto finish;
+    }
+    memcpy(data, r_data, data_size);
+finish:
+    aaudio_reply_free(&reply);
+    return status;
+}
+int aaudio_cmd_set_property(struct aaudio_device *a, aaudio_device_id_t devid, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop, void *qualifier, u64 qualifier_size, void *data, u64 data_size)
+{
+    CMD_DEF_SHARED_AND_SEND(aaudio_msg_write_set_property, devid, obj, prop, data, data_size,
+            qualifier, qualifier_size);
+    CMD_HNDL_REPLY_AND_FREE(aaudio_msg_read_set_property_response, &obj);
+}
+int aaudio_cmd_property_listener(struct aaudio_device *a, aaudio_device_id_t devid, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop)
+{
+    CMD_DEF_SHARED_AND_SEND(aaudio_msg_write_property_listener, devid, obj, prop);
+    CMD_HNDL_REPLY_AND_FREE(aaudio_msg_read_property_listener_response, &obj, &prop);
+}
+int aaudio_cmd_set_input_stream_address_ranges(struct aaudio_device *a, aaudio_device_id_t devid)
+{
+    CMD_DEF_SHARED_AND_SEND(aaudio_msg_write_set_input_stream_address_ranges, devid);
+    CMD_HNDL_REPLY_AND_FREE(aaudio_msg_read_set_input_stream_address_ranges_response);
+}
+int aaudio_cmd_get_input_stream_list(struct aaudio_device *a, struct aaudio_msg *buf, aaudio_device_id_t devid,
+        aaudio_object_id_t **str_l, u64 *str_cnt)
+{
+    CMD_DEF_SHARED_NO_REPLY_AND_SEND(aaudio_msg_write_get_input_stream_list, devid);
+    CMD_HNDL_REPLY_NO_FREE(aaudio_msg_read_get_input_stream_list_response, str_l, str_cnt);
+}
+int aaudio_cmd_get_output_stream_list(struct aaudio_device *a, struct aaudio_msg *buf, aaudio_device_id_t devid,
+        aaudio_object_id_t **str_l, u64 *str_cnt)
+{
+    CMD_DEF_SHARED_NO_REPLY_AND_SEND(aaudio_msg_write_get_output_stream_list, devid);
+    CMD_HNDL_REPLY_NO_FREE(aaudio_msg_read_get_output_stream_list_response, str_l, str_cnt);
+}
+int aaudio_cmd_set_remote_access(struct aaudio_device *a, u64 mode)
+{
+    CMD_DEF_SHARED_AND_SEND(aaudio_msg_write_set_remote_access, mode);
+    CMD_HNDL_REPLY_AND_FREE(aaudio_msg_read_set_remote_access_response);
+}
+int aaudio_cmd_get_device_list(struct aaudio_device *a, struct aaudio_msg *buf,
+        aaudio_device_id_t **dev_l, u64 *dev_cnt)
+{
+    CMD_DEF_SHARED_NO_REPLY_AND_SEND(aaudio_msg_write_get_device_list);
+    CMD_HNDL_REPLY_NO_FREE(aaudio_msg_read_get_device_list_response, dev_l, dev_cnt);
+}
\ No newline at end of file
diff --git a/drivers/staging/apple-bce/audio/protocol.h b/drivers/staging/apple-bce/audio/protocol.h
new file mode 100644
index 000000000000..3427486f3f57
--- /dev/null
+++ b/drivers/staging/apple-bce/audio/protocol.h
@@ -0,0 +1,147 @@
+#ifndef AAUDIO_PROTOCOL_H
+#define AAUDIO_PROTOCOL_H
+
+#include <linux/types.h>
+
+struct aaudio_device;
+
+typedef u64 aaudio_device_id_t;
+typedef u64 aaudio_object_id_t;
+
+struct aaudio_msg {
+    void *data;
+    size_t size;
+};
+
+struct __attribute__((packed)) aaudio_msg_header {
+    char tag[4];
+    u8 type;
+    aaudio_device_id_t device_id; // Idk, use zero for commands?
+};
+struct __attribute__((packed)) aaudio_msg_base {
+    u32 msg;
+    u32 status;
+};
+
+struct aaudio_prop_addr {
+    u32 scope;
+    u32 selector;
+    u32 element;
+};
+#define AAUDIO_PROP(scope, sel, el) (struct aaudio_prop_addr) { scope, sel, el }
+
+enum {
+    AAUDIO_MSG_TYPE_COMMAND = 1,
+    AAUDIO_MSG_TYPE_RESPONSE = 2,
+    AAUDIO_MSG_TYPE_NOTIFICATION = 3
+};
+
+enum {
+    AAUDIO_MSG_START_IO = 0,
+    AAUDIO_MSG_START_IO_RESPONSE = 1,
+    AAUDIO_MSG_STOP_IO = 2,
+    AAUDIO_MSG_STOP_IO_RESPONSE = 3,
+    AAUDIO_MSG_UPDATE_TIMESTAMP = 4,
+    AAUDIO_MSG_GET_PROPERTY = 7,
+    AAUDIO_MSG_GET_PROPERTY_RESPONSE = 8,
+    AAUDIO_MSG_SET_PROPERTY = 9,
+    AAUDIO_MSG_SET_PROPERTY_RESPONSE = 10,
+    AAUDIO_MSG_PROPERTY_LISTENER = 11,
+    AAUDIO_MSG_PROPERTY_LISTENER_RESPONSE = 12,
+    AAUDIO_MSG_PROPERTY_CHANGED = 13,
+    AAUDIO_MSG_SET_INPUT_STREAM_ADDRESS_RANGES = 18,
+    AAUDIO_MSG_SET_INPUT_STREAM_ADDRESS_RANGES_RESPONSE = 19,
+    AAUDIO_MSG_GET_INPUT_STREAM_LIST = 24,
+    AAUDIO_MSG_GET_INPUT_STREAM_LIST_RESPONSE = 25,
+    AAUDIO_MSG_GET_OUTPUT_STREAM_LIST = 26,
+    AAUDIO_MSG_GET_OUTPUT_STREAM_LIST_RESPONSE = 27,
+    AAUDIO_MSG_SET_REMOTE_ACCESS = 32,
+    AAUDIO_MSG_SET_REMOTE_ACCESS_RESPONSE = 33,
+    AAUDIO_MSG_UPDATE_TIMESTAMP_RESPONSE = 34,
+
+    AAUDIO_MSG_NOTIFICATION_ALIVE = 100,
+    AAUDIO_MSG_GET_DEVICE_LIST = 101,
+    AAUDIO_MSG_GET_DEVICE_LIST_RESPONSE = 102,
+    AAUDIO_MSG_NOTIFICATION_BOOT = 104
+};
+
+enum {
+    AAUDIO_REMOTE_ACCESS_OFF = 0,
+    AAUDIO_REMOTE_ACCESS_ON = 2
+};
+
+enum {
+    AAUDIO_PROP_SCOPE_GLOBAL = 0x676c6f62, // 'glob'
+    AAUDIO_PROP_SCOPE_INPUT  = 0x696e7074, // 'inpt'
+    AAUDIO_PROP_SCOPE_OUTPUT = 0x6f757470  // 'outp'
+};
+
+enum {
+    AAUDIO_PROP_UID          = 0x75696420, // 'uid '
+    AAUDIO_PROP_BOOL_VALUE   = 0x6263766c, // 'bcvl'
+    AAUDIO_PROP_JACK_PLUGGED = 0x6a61636b, // 'jack'
+    AAUDIO_PROP_SEL_VOLUME   = 0x64656176, // 'deav'
+    AAUDIO_PROP_LATENCY      = 0x6c746e63, // 'ltnc'
+    AAUDIO_PROP_PHYS_FORMAT  = 0x70667420  // 'pft '
+};
+
+int aaudio_msg_read_base(struct aaudio_msg *msg, struct aaudio_msg_base *base);
+
+int aaudio_msg_read_start_io_response(struct aaudio_msg *msg);
+int aaudio_msg_read_stop_io_response(struct aaudio_msg *msg);
+int aaudio_msg_read_update_timestamp(struct aaudio_msg *msg, aaudio_device_id_t *devid,
+        u64 *timestamp, u64 *update_seed);
+int aaudio_msg_read_get_property_response(struct aaudio_msg *msg, aaudio_object_id_t *obj,
+        struct aaudio_prop_addr *prop, void **data, u64 *data_size);
+int aaudio_msg_read_set_property_response(struct aaudio_msg *msg, aaudio_object_id_t *obj);
+int aaudio_msg_read_property_listener_response(struct aaudio_msg *msg,aaudio_object_id_t *obj,
+        struct aaudio_prop_addr *prop);
+int aaudio_msg_read_property_changed(struct aaudio_msg *msg, aaudio_device_id_t *devid, aaudio_object_id_t *obj,
+        struct aaudio_prop_addr *prop);
+int aaudio_msg_read_set_input_stream_address_ranges_response(struct aaudio_msg *msg);
+int aaudio_msg_read_get_input_stream_list_response(struct aaudio_msg *msg, aaudio_object_id_t **str_l, u64 *str_cnt);
+int aaudio_msg_read_get_output_stream_list_response(struct aaudio_msg *msg, aaudio_object_id_t **str_l, u64 *str_cnt);
+int aaudio_msg_read_set_remote_access_response(struct aaudio_msg *msg);
+int aaudio_msg_read_get_device_list_response(struct aaudio_msg *msg, aaudio_device_id_t **dev_l, u64 *dev_cnt);
+
+void aaudio_msg_write_start_io(struct aaudio_msg *msg, aaudio_device_id_t dev);
+void aaudio_msg_write_stop_io(struct aaudio_msg *msg, aaudio_device_id_t dev);
+void aaudio_msg_write_get_property(struct aaudio_msg *msg, aaudio_device_id_t dev, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop, void *qualifier, u64 qualifier_size);
+void aaudio_msg_write_set_property(struct aaudio_msg *msg, aaudio_device_id_t dev, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop, void *data, u64 data_size, void *qualifier, u64 qualifier_size);
+void aaudio_msg_write_property_listener(struct aaudio_msg *msg, aaudio_device_id_t dev, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop);
+void aaudio_msg_write_set_input_stream_address_ranges(struct aaudio_msg *msg, aaudio_device_id_t devid);
+void aaudio_msg_write_get_input_stream_list(struct aaudio_msg *msg, aaudio_device_id_t devid);
+void aaudio_msg_write_get_output_stream_list(struct aaudio_msg *msg, aaudio_device_id_t devid);
+void aaudio_msg_write_set_remote_access(struct aaudio_msg *msg, u64 mode);
+void aaudio_msg_write_alive_notification(struct aaudio_msg *msg, u32 proto_ver, u32 msg_ver);
+void aaudio_msg_write_update_timestamp_response(struct aaudio_msg *msg);
+void aaudio_msg_write_get_device_list(struct aaudio_msg *msg);
+
+
+int aaudio_cmd_start_io(struct aaudio_device *a, aaudio_device_id_t devid);
+int aaudio_cmd_stop_io(struct aaudio_device *a, aaudio_device_id_t devid);
+int aaudio_cmd_get_property(struct aaudio_device *a, struct aaudio_msg *buf,
+        aaudio_device_id_t devid, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop, void *qualifier, u64 qualifier_size, void **data, u64 *data_size);
+int aaudio_cmd_get_primitive_property(struct aaudio_device *a,
+        aaudio_device_id_t devid, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop, void *qualifier, u64 qualifier_size, void *data, u64 data_size);
+int aaudio_cmd_set_property(struct aaudio_device *a, aaudio_device_id_t devid, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop, void *qualifier, u64 qualifier_size, void *data, u64 data_size);
+int aaudio_cmd_property_listener(struct aaudio_device *a, aaudio_device_id_t devid, aaudio_object_id_t obj,
+        struct aaudio_prop_addr prop);
+int aaudio_cmd_set_input_stream_address_ranges(struct aaudio_device *a, aaudio_device_id_t devid);
+int aaudio_cmd_get_input_stream_list(struct aaudio_device *a, struct aaudio_msg *buf, aaudio_device_id_t devid,
+        aaudio_object_id_t **str_l, u64 *str_cnt);
+int aaudio_cmd_get_output_stream_list(struct aaudio_device *a, struct aaudio_msg *buf, aaudio_device_id_t devid,
+        aaudio_object_id_t **str_l, u64 *str_cnt);
+int aaudio_cmd_set_remote_access(struct aaudio_device *a, u64 mode);
+int aaudio_cmd_get_device_list(struct aaudio_device *a, struct aaudio_msg *buf,
+        aaudio_device_id_t **dev_l, u64 *dev_cnt);
+
+
+
+#endif //AAUDIO_PROTOCOL_H
diff --git a/drivers/staging/apple-bce/audio/protocol_bce.c b/drivers/staging/apple-bce/audio/protocol_bce.c
new file mode 100644
index 000000000000..28f2dfd44d67
--- /dev/null
+++ b/drivers/staging/apple-bce/audio/protocol_bce.c
@@ -0,0 +1,226 @@
+#include "protocol_bce.h"
+
+#include "audio.h"
+
+static void aaudio_bce_out_queue_completion(struct bce_queue_sq *sq);
+static void aaudio_bce_in_queue_completion(struct bce_queue_sq *sq);
+static int aaudio_bce_queue_init(struct aaudio_device *dev, struct aaudio_bce_queue *q, const char *name, int direction,
+                                 bce_sq_completion cfn);
+void aaudio_bce_in_queue_submit_pending(struct aaudio_bce_queue *q, size_t count);
+
+int aaudio_bce_init(struct aaudio_device *dev)
+{
+    int status;
+    struct aaudio_bce *bce = &dev->bcem;
+    bce->cq = bce_create_cq(dev->bce, 0x80);
+    spin_lock_init(&bce->spinlock);
+    if (!bce->cq)
+        return -EINVAL;
+    if ((status = aaudio_bce_queue_init(dev, &bce->qout, "com.apple.BridgeAudio.IntelToARM", DMA_TO_DEVICE,
+            aaudio_bce_out_queue_completion))) {
+        return status;
+    }
+    if ((status = aaudio_bce_queue_init(dev, &bce->qin, "com.apple.BridgeAudio.ARMToIntel", DMA_FROM_DEVICE,
+            aaudio_bce_in_queue_completion))) {
+        return status;
+    }
+    aaudio_bce_in_queue_submit_pending(&bce->qin, bce->qin.el_count);
+    return 0;
+}
+
+int aaudio_bce_queue_init(struct aaudio_device *dev, struct aaudio_bce_queue *q, const char *name, int direction,
+        bce_sq_completion cfn)
+{
+    q->cq = dev->bcem.cq;
+    q->el_size = AAUDIO_BCE_QUEUE_ELEMENT_SIZE;
+    q->el_count = AAUDIO_BCE_QUEUE_ELEMENT_COUNT;
+    /* NOTE: The Apple impl uses 0x80 as the queue size, however we use 21 (in fact 20) to simplify the impl */
+    q->sq = bce_create_sq(dev->bce, q->cq, name, (u32) (q->el_count + 1), direction, cfn, dev);
+    if (!q->sq)
+        return -EINVAL;
+
+    q->data = dma_alloc_coherent(&dev->bce->pci->dev, q->el_size * q->el_count, &q->dma_addr, GFP_KERNEL);
+    if (!q->data) {
+        bce_destroy_sq(dev->bce, q->sq);
+        return -EINVAL;
+    }
+    return 0;
+}
+
+static void aaudio_send_create_tag(struct aaudio_bce *b, int *tagn, char tag[4])
+{
+    char tag_zero[5];
+    b->tag_num = (b->tag_num + 1) % AAUDIO_BCE_QUEUE_TAG_COUNT;
+    *tagn = b->tag_num;
+    snprintf(tag_zero, 5, "S%03d", b->tag_num);
+    *((u32 *) tag) = *((u32 *) tag_zero);
+}
+
+int __aaudio_send_prepare(struct aaudio_bce *b, struct aaudio_send_ctx *ctx, char *tag)
+{
+    int status;
+    size_t index;
+    void *dptr;
+    struct aaudio_msg_header *header;
+    if ((status = bce_reserve_submission(b->qout.sq, &ctx->timeout)))
+        return status;
+    spin_lock_irqsave(&b->spinlock, ctx->irq_flags);
+    index = b->qout.data_tail;
+    dptr = (u8 *) b->qout.data + index * b->qout.el_size;
+    ctx->msg.data = dptr;
+    header = dptr;
+    if (tag)
+        *((u32 *) header->tag) = *((u32 *) tag);
+    else
+        aaudio_send_create_tag(b, &ctx->tag_n, header->tag);
+    return 0;
+}
+
+void __aaudio_send(struct aaudio_bce *b, struct aaudio_send_ctx *ctx)
+{
+    struct bce_qe_submission *s = bce_next_submission(b->qout.sq);
+#ifdef DEBUG
+    pr_debug("aaudio: Sending command data\n");
+    print_hex_dump(KERN_DEBUG, "aaudio:OUT ", DUMP_PREFIX_NONE, 32, 1, ctx->msg.data, ctx->msg.size, true);
+#endif
+    bce_set_submission_single(s, b->qout.dma_addr + (dma_addr_t) (ctx->msg.data - b->qout.data), ctx->msg.size);
+    bce_submit_to_device(b->qout.sq);
+    b->qout.data_tail = (b->qout.data_tail + 1) % b->qout.el_count;
+    spin_unlock_irqrestore(&b->spinlock, ctx->irq_flags);
+}
+
+int __aaudio_send_cmd_sync(struct aaudio_bce *b, struct aaudio_send_ctx *ctx, struct aaudio_msg *reply)
+{
+    struct aaudio_bce_queue_entry ent;
+    DECLARE_COMPLETION_ONSTACK(cmpl);
+    ent.msg = reply;
+    ent.cmpl = &cmpl;
+    b->pending_entries[ctx->tag_n] = &ent;
+    __aaudio_send(b, ctx); /* unlocks the spinlock */
+    ctx->timeout = wait_for_completion_timeout(&cmpl, ctx->timeout);
+    if (ctx->timeout == 0) {
+        /* Remove the pending queue entry; this will be normally handled by the completion route but
+         * during a timeout it won't */
+        spin_lock_irqsave(&b->spinlock, ctx->irq_flags);
+        if (b->pending_entries[ctx->tag_n] == &ent)
+            b->pending_entries[ctx->tag_n] = NULL;
+        spin_unlock_irqrestore(&b->spinlock, ctx->irq_flags);
+        return -ETIMEDOUT;
+    }
+    return 0;
+}
+
+static void aaudio_handle_reply(struct aaudio_bce *b, struct aaudio_msg *reply)
+{
+    const char *tag;
+    int tagn;
+    unsigned long irq_flags;
+    char tag_zero[5];
+    struct aaudio_bce_queue_entry *entry;
+
+    tag = ((struct aaudio_msg_header *) reply->data)->tag;
+    if (tag[0] != 'S') {
+        pr_err("aaudio_handle_reply: Unexpected tag: %.4s\n", tag);
+        return;
+    }
+    *((u32 *) tag_zero) = *((u32 *) tag);
+    tag_zero[4] = 0;
+    if (kstrtoint(&tag_zero[1], 10, &tagn)) {
+        pr_err("aaudio_handle_reply: Tag parse failed: %.4s\n", tag);
+        return;
+    }
+
+    spin_lock_irqsave(&b->spinlock, irq_flags);
+    entry = b->pending_entries[tagn];
+    if (entry) {
+        if (reply->size < entry->msg->size)
+            entry->msg->size = reply->size;
+        memcpy(entry->msg->data, reply->data, entry->msg->size);
+        complete(entry->cmpl);
+
+        b->pending_entries[tagn] = NULL;
+    } else {
+        pr_err("aaudio_handle_reply: No queued item found for tag: %.4s\n", tag);
+    }
+    spin_unlock_irqrestore(&b->spinlock, irq_flags);
+}
+
+static void aaudio_bce_out_queue_completion(struct bce_queue_sq *sq)
+{
+    while (bce_next_completion(sq)) {
+        //pr_info("aaudio: Send confirmed\n");
+        bce_notify_submission_complete(sq);
+    }
+}
+
+static void aaudio_bce_in_queue_handle_msg(struct aaudio_device *a, struct aaudio_msg *msg);
+
+static void aaudio_bce_in_queue_completion(struct bce_queue_sq *sq)
+{
+    struct aaudio_msg msg;
+    struct aaudio_device *dev = sq->userdata;
+    struct aaudio_bce_queue *q = &dev->bcem.qin;
+    struct bce_sq_completion_data *c;
+    size_t cnt = 0;
+
+    mb();
+    while ((c = bce_next_completion(sq))) {
+        msg.data = (u8 *) q->data + q->data_head * q->el_size;
+        msg.size = c->data_size;
+#ifdef DEBUG
+        pr_debug("aaudio: Received command data %llx\n", c->data_size);
+        print_hex_dump(KERN_DEBUG, "aaudio:IN ", DUMP_PREFIX_NONE, 32, 1, msg.data, min(msg.size, 128UL), true);
+#endif
+        aaudio_bce_in_queue_handle_msg(dev, &msg);
+
+        q->data_head = (q->data_head + 1) % q->el_count;
+
+        bce_notify_submission_complete(sq);
+        ++cnt;
+    }
+    aaudio_bce_in_queue_submit_pending(q, cnt);
+}
+
+static void aaudio_bce_in_queue_handle_msg(struct aaudio_device *a, struct aaudio_msg *msg)
+{
+    struct aaudio_msg_header *header = (struct aaudio_msg_header *) msg->data;
+    if (msg->size < sizeof(struct aaudio_msg_header)) {
+        pr_err("aaudio: Msg size smaller than header (%lx)", msg->size);
+        return;
+    }
+    if (header->type == AAUDIO_MSG_TYPE_RESPONSE) {
+        aaudio_handle_reply(&a->bcem, msg);
+    } else if (header->type == AAUDIO_MSG_TYPE_COMMAND) {
+        aaudio_handle_command(a, msg);
+    } else if (header->type == AAUDIO_MSG_TYPE_NOTIFICATION) {
+        aaudio_handle_notification(a, msg);
+    }
+}
+
+void aaudio_bce_in_queue_submit_pending(struct aaudio_bce_queue *q, size_t count)
+{
+    struct bce_qe_submission *s;
+    while (count--) {
+        if (bce_reserve_submission(q->sq, NULL)) {
+            pr_err("aaudio: Failed to reserve an event queue submission\n");
+            break;
+        }
+        s = bce_next_submission(q->sq);
+        bce_set_submission_single(s, q->dma_addr + (dma_addr_t) (q->data_tail * q->el_size), q->el_size);
+        q->data_tail = (q->data_tail + 1) % q->el_count;
+    }
+    bce_submit_to_device(q->sq);
+}
+
+struct aaudio_msg aaudio_reply_alloc(void)
+{
+    struct aaudio_msg ret;
+    ret.size = AAUDIO_BCE_QUEUE_ELEMENT_SIZE;
+    ret.data = kmalloc(ret.size, GFP_KERNEL);
+    return ret;
+}
+
+void aaudio_reply_free(struct aaudio_msg *reply)
+{
+    kfree(reply->data);
+}
diff --git a/drivers/staging/apple-bce/audio/protocol_bce.h b/drivers/staging/apple-bce/audio/protocol_bce.h
new file mode 100644
index 000000000000..14d26c05ddf9
--- /dev/null
+++ b/drivers/staging/apple-bce/audio/protocol_bce.h
@@ -0,0 +1,72 @@
+#ifndef AAUDIO_PROTOCOL_BCE_H
+#define AAUDIO_PROTOCOL_BCE_H
+
+#include "protocol.h"
+#include "../queue.h"
+
+#define AAUDIO_BCE_QUEUE_ELEMENT_SIZE 0x1000
+#define AAUDIO_BCE_QUEUE_ELEMENT_COUNT 20
+
+#define AAUDIO_BCE_QUEUE_TAG_COUNT 1000
+
+struct aaudio_device;
+
+struct aaudio_bce_queue_entry {
+    struct aaudio_msg *msg;
+    struct completion *cmpl;
+};
+struct aaudio_bce_queue {
+    struct bce_queue_cq *cq;
+    struct bce_queue_sq *sq;
+    void *data;
+    dma_addr_t dma_addr;
+    size_t data_head, data_tail;
+    size_t el_size, el_count;
+};
+struct aaudio_bce {
+    struct bce_queue_cq *cq;
+    struct aaudio_bce_queue qin;
+    struct aaudio_bce_queue qout;
+    int tag_num;
+    struct aaudio_bce_queue_entry *pending_entries[AAUDIO_BCE_QUEUE_TAG_COUNT];
+    struct spinlock spinlock;
+};
+
+struct aaudio_send_ctx {
+    int status;
+    int tag_n;
+    unsigned long irq_flags;
+    struct aaudio_msg msg;
+    unsigned long timeout;
+};
+
+int aaudio_bce_init(struct aaudio_device *dev);
+int __aaudio_send_prepare(struct aaudio_bce *b, struct aaudio_send_ctx *ctx, char *tag);
+void __aaudio_send(struct aaudio_bce *b, struct aaudio_send_ctx *ctx);
+int __aaudio_send_cmd_sync(struct aaudio_bce *b, struct aaudio_send_ctx *ctx, struct aaudio_msg *reply);
+
+#define aaudio_send_with_tag(a, ctx, tag, tout, fn, ...) ({ \
+    (ctx)->timeout = msecs_to_jiffies(tout); \
+    (ctx)->status = __aaudio_send_prepare(&(a)->bcem, (ctx), (tag)); \
+    if (!(ctx)->status) { \
+        fn(&(ctx)->msg, ##__VA_ARGS__); \
+        __aaudio_send(&(a)->bcem, (ctx)); \
+    } \
+    (ctx)->status; \
+})
+#define aaudio_send(a, ctx, tout, fn, ...) aaudio_send_with_tag(a, ctx, NULL, tout, fn, ##__VA_ARGS__)
+
+#define aaudio_send_cmd_sync(a, ctx, reply, tout, fn, ...) ({ \
+    (ctx)->timeout = msecs_to_jiffies(tout); \
+    (ctx)->status = __aaudio_send_prepare(&(a)->bcem, (ctx), NULL); \
+    if (!(ctx)->status) { \
+        fn(&(ctx)->msg, ##__VA_ARGS__); \
+        (ctx)->status = __aaudio_send_cmd_sync(&(a)->bcem, (ctx), (reply)); \
+    } \
+    (ctx)->status; \
+})
+
+struct aaudio_msg aaudio_reply_alloc(void);
+void aaudio_reply_free(struct aaudio_msg *reply);
+
+#endif //AAUDIO_PROTOCOL_BCE_H
diff --git a/drivers/staging/apple-bce/mailbox.c b/drivers/staging/apple-bce/mailbox.c
new file mode 100644
index 000000000000..e24bd35215c0
--- /dev/null
+++ b/drivers/staging/apple-bce/mailbox.c
@@ -0,0 +1,151 @@
+#include "mailbox.h"
+#include <linux/atomic.h>
+#include "apple_bce.h"
+
+#define REG_MBOX_OUT_BASE 0x820
+#define REG_MBOX_REPLY_COUNTER 0x108
+#define REG_MBOX_REPLY_BASE 0x810
+#define REG_TIMESTAMP_BASE 0xC000
+
+#define BCE_MBOX_TIMEOUT_MS 200
+
+void bce_mailbox_init(struct bce_mailbox *mb, void __iomem *reg_mb)
+{
+    mb->reg_mb = reg_mb;
+    init_completion(&mb->mb_completion);
+}
+
+int bce_mailbox_send(struct bce_mailbox *mb, u64 msg, u64* recv)
+{
+    u32 __iomem *regb;
+
+    if (atomic_cmpxchg(&mb->mb_status, 0, 1) != 0) {
+        return -EEXIST; // We don't support two messages at once
+    }
+    reinit_completion(&mb->mb_completion);
+
+    pr_debug("bce_mailbox_send: %llx\n", msg);
+    regb = (u32*) ((u8*) mb->reg_mb + REG_MBOX_OUT_BASE);
+    iowrite32((u32) msg, regb);
+    iowrite32((u32) (msg >> 32), regb + 1);
+    iowrite32(0, regb + 2);
+    iowrite32(0, regb + 3);
+
+    wait_for_completion_timeout(&mb->mb_completion, msecs_to_jiffies(BCE_MBOX_TIMEOUT_MS));
+    if (atomic_read(&mb->mb_status) != 2) { // Didn't get the reply
+        atomic_set(&mb->mb_status, 0);
+        return -ETIMEDOUT;
+    }
+
+    *recv = mb->mb_result;
+    pr_debug("bce_mailbox_send: reply %llx\n", *recv);
+
+    atomic_set(&mb->mb_status, 0);
+    return 0;
+}
+
+static int bce_mailbox_retrive_response(struct bce_mailbox *mb)
+{
+    u32 __iomem *regb;
+    u32 lo, hi;
+    int count, counter;
+    u32 res = ioread32((u8*) mb->reg_mb + REG_MBOX_REPLY_COUNTER);
+    count = (res >> 20) & 0xf;
+    counter = count;
+    pr_debug("bce_mailbox_retrive_response count=%i\n", count);
+    while (counter--) {
+        regb = (u32*) ((u8*) mb->reg_mb + REG_MBOX_REPLY_BASE);
+        lo = ioread32(regb);
+        hi = ioread32(regb + 1);
+        ioread32(regb + 2);
+        ioread32(regb + 3);
+        pr_debug("bce_mailbox_retrive_response %llx\n", ((u64) hi << 32) | lo);
+        mb->mb_result = ((u64) hi << 32) | lo;
+    }
+    return count > 0 ? 0 : -ENODATA;
+}
+
+int bce_mailbox_handle_interrupt(struct bce_mailbox *mb)
+{
+    int status = bce_mailbox_retrive_response(mb);
+    if (!status) {
+        atomic_set(&mb->mb_status, 2);
+        complete(&mb->mb_completion);
+    }
+    return status;
+}
+
+static void bc_send_timestamp(struct timer_list *tl);
+
+void bce_timestamp_init(struct bce_timestamp *ts, void __iomem *reg)
+{
+    u32 __iomem *regb;
+
+    spin_lock_init(&ts->stop_sl);
+    ts->stopped = false;
+
+    ts->reg = reg;
+
+    regb = (u32*) ((u8*) ts->reg + REG_TIMESTAMP_BASE);
+
+    ioread32(regb);
+    mb();
+
+    timer_setup(&ts->timer, bc_send_timestamp, 0);
+}
+
+void bce_timestamp_start(struct bce_timestamp *ts, bool is_initial)
+{
+    unsigned long flags;
+    u32 __iomem *regb = (u32*) ((u8*) ts->reg + REG_TIMESTAMP_BASE);
+
+    if (is_initial) {
+        iowrite32((u32) -4, regb + 2);
+        iowrite32((u32) -1, regb);
+    } else {
+        iowrite32((u32) -3, regb + 2);
+        iowrite32((u32) -1, regb);
+    }
+
+    spin_lock_irqsave(&ts->stop_sl, flags);
+    ts->stopped = false;
+    spin_unlock_irqrestore(&ts->stop_sl, flags);
+    mod_timer(&ts->timer, jiffies + msecs_to_jiffies(150));
+}
+
+void bce_timestamp_stop(struct bce_timestamp *ts)
+{
+    unsigned long flags;
+    u32 __iomem *regb = (u32*) ((u8*) ts->reg + REG_TIMESTAMP_BASE);
+
+    spin_lock_irqsave(&ts->stop_sl, flags);
+    ts->stopped = true;
+    spin_unlock_irqrestore(&ts->stop_sl, flags);
+    del_timer_sync(&ts->timer);
+
+    iowrite32((u32) -2, regb + 2);
+    iowrite32((u32) -1, regb);
+}
+
+static void bc_send_timestamp(struct timer_list *tl)
+{
+    struct bce_timestamp *ts;
+    unsigned long flags;
+    u32 __iomem *regb;
+    ktime_t bt;
+
+    ts = container_of(tl, struct bce_timestamp, timer);
+    regb = (u32*) ((u8*) ts->reg + REG_TIMESTAMP_BASE);
+    local_irq_save(flags);
+    ioread32(regb + 2);
+    mb();
+    bt = ktime_get_boottime();
+    iowrite32((u32) bt, regb + 2);
+    iowrite32((u32) (bt >> 32), regb);
+
+    spin_lock(&ts->stop_sl);
+    if (!ts->stopped)
+        mod_timer(&ts->timer, jiffies + msecs_to_jiffies(150));
+    spin_unlock(&ts->stop_sl);
+    local_irq_restore(flags);
+}
\ No newline at end of file
diff --git a/drivers/staging/apple-bce/mailbox.h b/drivers/staging/apple-bce/mailbox.h
new file mode 100644
index 000000000000..f3323f95ba51
--- /dev/null
+++ b/drivers/staging/apple-bce/mailbox.h
@@ -0,0 +1,53 @@
+#ifndef BCE_MAILBOX_H
+#define BCE_MAILBOX_H
+
+#include <linux/completion.h>
+#include <linux/pci.h>
+#include <linux/timer.h>
+
+struct bce_mailbox {
+    void __iomem *reg_mb;
+
+    atomic_t mb_status; // possible statuses: 0 (no msg), 1 (has active msg), 2 (got reply)
+    struct completion mb_completion;
+    uint64_t mb_result;
+};
+
+enum bce_message_type {
+    BCE_MB_REGISTER_COMMAND_SQ = 0x7,            // to-device
+    BCE_MB_REGISTER_COMMAND_CQ = 0x8,            // to-device
+    BCE_MB_REGISTER_COMMAND_QUEUE_REPLY = 0xB,   // to-host
+    BCE_MB_SET_FW_PROTOCOL_VERSION = 0xC,        // both
+    BCE_MB_SLEEP_NO_STATE = 0x14,                // to-device
+    BCE_MB_RESTORE_NO_STATE = 0x15,              // to-device
+    BCE_MB_SAVE_STATE_AND_SLEEP = 0x17,          // to-device
+    BCE_MB_RESTORE_STATE_AND_WAKE = 0x18,        // to-device
+    BCE_MB_SAVE_STATE_AND_SLEEP_FAILURE = 0x19,  // from-device
+    BCE_MB_SAVE_RESTORE_STATE_COMPLETE = 0x1A,   // from-device
+};
+
+#define BCE_MB_MSG(type, value) (((u64) (type) << 58) | ((value) & 0x3FFFFFFFFFFFFFFLL))
+#define BCE_MB_TYPE(v) ((u32) (v >> 58))
+#define BCE_MB_VALUE(v) (v & 0x3FFFFFFFFFFFFFFLL)
+
+void bce_mailbox_init(struct bce_mailbox *mb, void __iomem *reg_mb);
+
+int bce_mailbox_send(struct bce_mailbox *mb, u64 msg, u64* recv);
+
+int bce_mailbox_handle_interrupt(struct bce_mailbox *mb);
+
+
+struct bce_timestamp {
+    void __iomem *reg;
+    struct timer_list timer;
+    struct spinlock stop_sl;
+    bool stopped;
+};
+
+void bce_timestamp_init(struct bce_timestamp *ts, void __iomem *reg);
+
+void bce_timestamp_start(struct bce_timestamp *ts, bool is_initial);
+
+void bce_timestamp_stop(struct bce_timestamp *ts);
+
+#endif //BCEDRIVER_MAILBOX_H
diff --git a/drivers/staging/apple-bce/queue.c b/drivers/staging/apple-bce/queue.c
new file mode 100644
index 000000000000..bc9cd3bc6f0c
--- /dev/null
+++ b/drivers/staging/apple-bce/queue.c
@@ -0,0 +1,390 @@
+#include "queue.h"
+#include "apple_bce.h"
+
+#define REG_DOORBELL_BASE 0x44000
+
+struct bce_queue_cq *bce_alloc_cq(struct apple_bce_device *dev, int qid, u32 el_count)
+{
+    struct bce_queue_cq *q;
+    q = kzalloc(sizeof(struct bce_queue_cq), GFP_KERNEL);
+    q->qid = qid;
+    q->type = BCE_QUEUE_CQ;
+    q->el_count = el_count;
+    q->data = dma_alloc_coherent(&dev->pci->dev, el_count * sizeof(struct bce_qe_completion),
+            &q->dma_handle, GFP_KERNEL);
+    if (!q->data) {
+        pr_err("DMA queue memory alloc failed\n");
+        kfree(q);
+        return NULL;
+    }
+    return q;
+}
+
+void bce_get_cq_memcfg(struct bce_queue_cq *cq, struct bce_queue_memcfg *cfg)
+{
+    cfg->qid = (u16) cq->qid;
+    cfg->el_count = (u16) cq->el_count;
+    cfg->vector_or_cq = 0;
+    cfg->_pad = 0;
+    cfg->addr = cq->dma_handle;
+    cfg->length = cq->el_count * sizeof(struct bce_qe_completion);
+}
+
+void bce_free_cq(struct apple_bce_device *dev, struct bce_queue_cq *cq)
+{
+    dma_free_coherent(&dev->pci->dev, cq->el_count * sizeof(struct bce_qe_completion), cq->data, cq->dma_handle);
+    kfree(cq);
+}
+
+static void bce_handle_cq_completion(struct apple_bce_device *dev, struct bce_qe_completion *e, size_t *ce)
+{
+    struct bce_queue *target;
+    struct bce_queue_sq *target_sq;
+    struct bce_sq_completion_data *cmpl;
+    if (e->qid >= BCE_MAX_QUEUE_COUNT) {
+        pr_err("Device sent a response for qid (%u) >= BCE_MAX_QUEUE_COUNT\n", e->qid);
+        return;
+    }
+    target = dev->queues[e->qid];
+    if (!target || target->type != BCE_QUEUE_SQ) {
+        pr_err("Device sent a response for qid (%u), which does not exist\n", e->qid);
+        return;
+    }
+    target_sq = (struct bce_queue_sq *) target;
+    if (target_sq->completion_tail != e->completion_index) {
+        pr_err("Completion index mismatch; this is likely going to make this driver unusable\n");
+        return;
+    }
+    if (!target_sq->has_pending_completions) {
+        target_sq->has_pending_completions = true;
+        dev->int_sq_list[(*ce)++] = target_sq;
+    }
+    cmpl = &target_sq->completion_data[e->completion_index];
+    cmpl->status = e->status;
+    cmpl->data_size = e->data_size;
+    cmpl->result = e->result;
+    wmb();
+    target_sq->completion_tail = (target_sq->completion_tail + 1) % target_sq->el_count;
+}
+
+void bce_handle_cq_completions(struct apple_bce_device *dev, struct bce_queue_cq *cq)
+{
+    size_t ce = 0;
+    struct bce_qe_completion *e;
+    struct bce_queue_sq *sq;
+    e = bce_cq_element(cq, cq->index);
+    if (!(e->flags & BCE_COMPLETION_FLAG_PENDING))
+        return;
+    mb();
+    while (true) {
+        e = bce_cq_element(cq, cq->index);
+        if (!(e->flags & BCE_COMPLETION_FLAG_PENDING))
+            break;
+        // pr_info("apple-bce: compl: %i: %i %llx %llx", e->qid, e->status, e->data_size, e->result);
+        bce_handle_cq_completion(dev, e, &ce);
+        e->flags = 0;
+        cq->index = (cq->index + 1) % cq->el_count;
+    }
+    mb();
+    iowrite32(cq->index, (u32 *) ((u8 *) dev->reg_mem_dma +  REG_DOORBELL_BASE) + cq->qid);
+    while (ce) {
+        --ce;
+        sq = dev->int_sq_list[ce];
+        sq->completion(sq);
+        sq->has_pending_completions = false;
+    }
+}
+
+
+struct bce_queue_sq *bce_alloc_sq(struct apple_bce_device *dev, int qid, u32 el_size, u32 el_count,
+        bce_sq_completion compl, void *userdata)
+{
+    struct bce_queue_sq *q;
+    q = kzalloc(sizeof(struct bce_queue_sq), GFP_KERNEL);
+    q->qid = qid;
+    q->type = BCE_QUEUE_SQ;
+    q->el_size = el_size;
+    q->el_count = el_count;
+    q->data = dma_alloc_coherent(&dev->pci->dev, el_count * el_size,
+                                 &q->dma_handle, GFP_KERNEL);
+    q->completion = compl;
+    q->userdata = userdata;
+    q->completion_data = kzalloc(sizeof(struct bce_sq_completion_data) * el_count, GFP_KERNEL);
+    q->reg_mem_dma = dev->reg_mem_dma;
+    atomic_set(&q->available_commands, el_count - 1);
+    init_completion(&q->available_command_completion);
+    atomic_set(&q->available_command_completion_waiting_count, 0);
+    if (!q->data) {
+        pr_err("DMA queue memory alloc failed\n");
+        kfree(q);
+        return NULL;
+    }
+    return q;
+}
+
+void bce_get_sq_memcfg(struct bce_queue_sq *sq, struct bce_queue_cq *cq, struct bce_queue_memcfg *cfg)
+{
+    cfg->qid = (u16) sq->qid;
+    cfg->el_count = (u16) sq->el_count;
+    cfg->vector_or_cq = (u16) cq->qid;
+    cfg->_pad = 0;
+    cfg->addr = sq->dma_handle;
+    cfg->length = sq->el_count * sq->el_size;
+}
+
+void bce_free_sq(struct apple_bce_device *dev, struct bce_queue_sq *sq)
+{
+    dma_free_coherent(&dev->pci->dev, sq->el_count * sq->el_size, sq->data, sq->dma_handle);
+    kfree(sq);
+}
+
+int bce_reserve_submission(struct bce_queue_sq *sq, unsigned long *timeout)
+{
+    while (atomic_dec_if_positive(&sq->available_commands) < 0) {
+        if (!timeout || !*timeout)
+            return -EAGAIN;
+        atomic_inc(&sq->available_command_completion_waiting_count);
+        *timeout = wait_for_completion_timeout(&sq->available_command_completion, *timeout);
+        if (!*timeout) {
+            if (atomic_dec_if_positive(&sq->available_command_completion_waiting_count) < 0)
+                try_wait_for_completion(&sq->available_command_completion); /* consume the pending completion */
+        }
+    }
+    return 0;
+}
+
+void bce_cancel_submission_reservation(struct bce_queue_sq *sq)
+{
+    atomic_inc(&sq->available_commands);
+}
+
+void *bce_next_submission(struct bce_queue_sq *sq)
+{
+    void *ret = bce_sq_element(sq, sq->tail);
+    sq->tail = (sq->tail + 1) % sq->el_count;
+    return ret;
+}
+
+void bce_submit_to_device(struct bce_queue_sq *sq)
+{
+    mb();
+    iowrite32(sq->tail, (u32 *) ((u8 *) sq->reg_mem_dma +  REG_DOORBELL_BASE) + sq->qid);
+}
+
+void bce_notify_submission_complete(struct bce_queue_sq *sq)
+{
+    sq->head = (sq->head + 1) % sq->el_count;
+    atomic_inc(&sq->available_commands);
+    if (atomic_dec_if_positive(&sq->available_command_completion_waiting_count) >= 0) {
+        complete(&sq->available_command_completion);
+    }
+}
+
+void bce_set_submission_single(struct bce_qe_submission *element, dma_addr_t addr, size_t size)
+{
+    element->addr = addr;
+    element->length = size;
+    element->segl_addr = element->segl_length = 0;
+}
+
+static void bce_cmdq_completion(struct bce_queue_sq *q);
+
+struct bce_queue_cmdq *bce_alloc_cmdq(struct apple_bce_device *dev, int qid, u32 el_count)
+{
+    struct bce_queue_cmdq *q;
+    q = kzalloc(sizeof(struct bce_queue_cmdq), GFP_KERNEL);
+    q->sq = bce_alloc_sq(dev, qid, BCE_CMD_SIZE, el_count, bce_cmdq_completion, q);
+    if (!q->sq) {
+        kfree(q);
+        return NULL;
+    }
+    spin_lock_init(&q->lck);
+    q->tres = kzalloc(sizeof(struct bce_queue_cmdq_result_el*) * el_count, GFP_KERNEL);
+    if (!q->tres) {
+        kfree(q);
+        return NULL;
+    }
+    return q;
+}
+
+void bce_free_cmdq(struct apple_bce_device *dev, struct bce_queue_cmdq *cmdq)
+{
+    bce_free_sq(dev, cmdq->sq);
+    kfree(cmdq->tres);
+    kfree(cmdq);
+}
+
+void bce_cmdq_completion(struct bce_queue_sq *q)
+{
+    struct bce_queue_cmdq_result_el *el;
+    struct bce_queue_cmdq *cmdq = q->userdata;
+    struct bce_sq_completion_data *result;
+
+    spin_lock(&cmdq->lck);
+    while ((result = bce_next_completion(q))) {
+        el = cmdq->tres[cmdq->sq->head];
+        if (el) {
+            el->result = result->result;
+            el->status = result->status;
+            mb();
+            complete(&el->cmpl);
+        } else {
+            pr_err("apple-bce: Unexpected command queue completion\n");
+        }
+        cmdq->tres[cmdq->sq->head] = NULL;
+        bce_notify_submission_complete(q);
+    }
+    spin_unlock(&cmdq->lck);
+}
+
+static __always_inline void *bce_cmd_start(struct bce_queue_cmdq *cmdq, struct bce_queue_cmdq_result_el *res)
+{
+    void *ret;
+    unsigned long timeout;
+    init_completion(&res->cmpl);
+    mb();
+
+    timeout = msecs_to_jiffies(1000L * 60 * 5); /* wait for up to ~5 minutes */
+    if (bce_reserve_submission(cmdq->sq, &timeout))
+        return NULL;
+
+    spin_lock(&cmdq->lck);
+    cmdq->tres[cmdq->sq->tail] = res;
+    ret = bce_next_submission(cmdq->sq);
+    return ret;
+}
+
+static __always_inline void bce_cmd_finish(struct bce_queue_cmdq *cmdq, struct bce_queue_cmdq_result_el *res)
+{
+    bce_submit_to_device(cmdq->sq);
+    spin_unlock(&cmdq->lck);
+
+    wait_for_completion(&res->cmpl);
+    mb();
+}
+
+u32 bce_cmd_register_queue(struct bce_queue_cmdq *cmdq, struct bce_queue_memcfg *cfg, const char *name, bool isdirout)
+{
+    struct bce_queue_cmdq_result_el res;
+    struct bce_cmdq_register_memory_queue_cmd *cmd = bce_cmd_start(cmdq, &res);
+    if (!cmd)
+        return (u32) -1;
+    cmd->cmd = BCE_CMD_REGISTER_MEMORY_QUEUE;
+    cmd->flags = (u16) ((name ? 2 : 0) | (isdirout ? 1 : 0));
+    cmd->qid = cfg->qid;
+    cmd->el_count = cfg->el_count;
+    cmd->vector_or_cq = cfg->vector_or_cq;
+    memset(cmd->name, 0, sizeof(cmd->name));
+    if (name) {
+        cmd->name_len = (u16) min(strlen(name), (size_t) sizeof(cmd->name));
+        memcpy(cmd->name, name, cmd->name_len);
+    } else {
+        cmd->name_len = 0;
+    }
+    cmd->addr = cfg->addr;
+    cmd->length = cfg->length;
+
+    bce_cmd_finish(cmdq, &res);
+    return res.status;
+}
+
+u32 bce_cmd_unregister_memory_queue(struct bce_queue_cmdq *cmdq, u16 qid)
+{
+    struct bce_queue_cmdq_result_el res;
+    struct bce_cmdq_simple_memory_queue_cmd *cmd = bce_cmd_start(cmdq, &res);
+    if (!cmd)
+        return (u32) -1;
+    cmd->cmd = BCE_CMD_UNREGISTER_MEMORY_QUEUE;
+    cmd->flags = 0;
+    cmd->qid = qid;
+    bce_cmd_finish(cmdq, &res);
+    return res.status;
+}
+
+u32 bce_cmd_flush_memory_queue(struct bce_queue_cmdq *cmdq, u16 qid)
+{
+    struct bce_queue_cmdq_result_el res;
+    struct bce_cmdq_simple_memory_queue_cmd *cmd = bce_cmd_start(cmdq, &res);
+    if (!cmd)
+        return (u32) -1;
+    cmd->cmd = BCE_CMD_FLUSH_MEMORY_QUEUE;
+    cmd->flags = 0;
+    cmd->qid = qid;
+    bce_cmd_finish(cmdq, &res);
+    return res.status;
+}
+
+
+struct bce_queue_cq *bce_create_cq(struct apple_bce_device *dev, u32 el_count)
+{
+    struct bce_queue_cq *cq;
+    struct bce_queue_memcfg cfg;
+    int qid = ida_simple_get(&dev->queue_ida, BCE_QUEUE_USER_MIN, BCE_QUEUE_USER_MAX, GFP_KERNEL);
+    if (qid < 0)
+        return NULL;
+    cq = bce_alloc_cq(dev, qid, el_count);
+    if (!cq)
+        return NULL;
+    bce_get_cq_memcfg(cq, &cfg);
+    if (bce_cmd_register_queue(dev->cmd_cmdq, &cfg, NULL, false) != 0) {
+        pr_err("apple-bce: CQ registration failed (%i)", qid);
+        bce_free_cq(dev, cq);
+        ida_simple_remove(&dev->queue_ida, (uint) qid);
+        return NULL;
+    }
+    dev->queues[qid] = (struct bce_queue *) cq;
+    return cq;
+}
+
+struct bce_queue_sq *bce_create_sq(struct apple_bce_device *dev, struct bce_queue_cq *cq, const char *name, u32 el_count,
+        int direction, bce_sq_completion compl, void *userdata)
+{
+    struct bce_queue_sq *sq;
+    struct bce_queue_memcfg cfg;
+    int qid;
+    if (cq == NULL)
+        return NULL; /* cq can not be null */
+    if (name == NULL)
+        return NULL; /* name can not be null */
+    if (direction != DMA_TO_DEVICE && direction != DMA_FROM_DEVICE)
+        return NULL; /* unsupported direction */
+    qid = ida_simple_get(&dev->queue_ida, BCE_QUEUE_USER_MIN, BCE_QUEUE_USER_MAX, GFP_KERNEL);
+    if (qid < 0)
+        return NULL;
+    sq = bce_alloc_sq(dev, qid, sizeof(struct bce_qe_submission), el_count, compl, userdata);
+    if (!sq)
+        return NULL;
+    bce_get_sq_memcfg(sq, cq, &cfg);
+    if (bce_cmd_register_queue(dev->cmd_cmdq, &cfg, name, direction != DMA_FROM_DEVICE) != 0) {
+        pr_err("apple-bce: SQ registration failed (%i)", qid);
+        bce_free_sq(dev, sq);
+        ida_simple_remove(&dev->queue_ida, (uint) qid);
+        return NULL;
+    }
+    spin_lock(&dev->queues_lock);
+    dev->queues[qid] = (struct bce_queue *) sq;
+    spin_unlock(&dev->queues_lock);
+    return sq;
+}
+
+void bce_destroy_cq(struct apple_bce_device *dev, struct bce_queue_cq *cq)
+{
+    if (!dev->is_being_removed && bce_cmd_unregister_memory_queue(dev->cmd_cmdq, (u16) cq->qid))
+        pr_err("apple-bce: CQ unregister failed");
+    spin_lock(&dev->queues_lock);
+    dev->queues[cq->qid] = NULL;
+    spin_unlock(&dev->queues_lock);
+    ida_simple_remove(&dev->queue_ida, (uint) cq->qid);
+    bce_free_cq(dev, cq);
+}
+
+void bce_destroy_sq(struct apple_bce_device *dev, struct bce_queue_sq *sq)
+{
+    if (!dev->is_being_removed && bce_cmd_unregister_memory_queue(dev->cmd_cmdq, (u16) sq->qid))
+        pr_err("apple-bce: CQ unregister failed");
+    spin_lock(&dev->queues_lock);
+    dev->queues[sq->qid] = NULL;
+    spin_unlock(&dev->queues_lock);
+    ida_simple_remove(&dev->queue_ida, (uint) sq->qid);
+    bce_free_sq(dev, sq);
+}
\ No newline at end of file
diff --git a/drivers/staging/apple-bce/queue.h b/drivers/staging/apple-bce/queue.h
new file mode 100644
index 000000000000..8368ac5dfca8
--- /dev/null
+++ b/drivers/staging/apple-bce/queue.h
@@ -0,0 +1,177 @@
+#ifndef BCE_QUEUE_H
+#define BCE_QUEUE_H
+
+#include <linux/completion.h>
+#include <linux/pci.h>
+
+#define BCE_CMD_SIZE 0x40
+
+struct apple_bce_device;
+
+enum bce_queue_type {
+    BCE_QUEUE_CQ, BCE_QUEUE_SQ
+};
+struct bce_queue {
+    int qid;
+    int type;
+};
+struct bce_queue_cq {
+    int qid;
+    int type;
+    u32 el_count;
+    dma_addr_t dma_handle;
+    void *data;
+
+    u32 index;
+};
+struct bce_queue_sq;
+typedef void (*bce_sq_completion)(struct bce_queue_sq *q);
+struct bce_sq_completion_data {
+    u32 status;
+    u64 data_size;
+    u64 result;
+};
+struct bce_queue_sq {
+    int qid;
+    int type;
+    u32 el_size;
+    u32 el_count;
+    dma_addr_t dma_handle;
+    void *data;
+    void *userdata;
+    void __iomem *reg_mem_dma;
+
+    atomic_t available_commands;
+    struct completion available_command_completion;
+    atomic_t available_command_completion_waiting_count;
+    u32 head, tail;
+
+    u32 completion_cidx, completion_tail;
+    struct bce_sq_completion_data *completion_data;
+    bool has_pending_completions;
+    bce_sq_completion completion;
+};
+
+struct bce_queue_cmdq_result_el {
+    struct completion cmpl;
+    u32 status;
+    u64 result;
+};
+struct bce_queue_cmdq {
+    struct bce_queue_sq *sq;
+    struct spinlock lck;
+    struct bce_queue_cmdq_result_el **tres;
+};
+
+struct bce_queue_memcfg {
+    u16 qid;
+    u16 el_count;
+    u16 vector_or_cq;
+    u16 _pad;
+    u64 addr;
+    u64 length;
+};
+
+enum bce_qe_completion_status {
+    BCE_COMPLETION_SUCCESS = 0,
+    BCE_COMPLETION_ERROR = 1,
+    BCE_COMPLETION_ABORTED = 2,
+    BCE_COMPLETION_NO_SPACE = 3,
+    BCE_COMPLETION_OVERRUN = 4
+};
+enum bce_qe_completion_flags {
+    BCE_COMPLETION_FLAG_PENDING = 0x8000
+};
+struct bce_qe_completion {
+    u64 result;
+    u64 data_size;
+    u16 qid;
+    u16 completion_index;
+    u16 status; // bce_qe_completion_status
+    u16 flags;  // bce_qe_completion_flags
+};
+
+struct bce_qe_submission {
+    u64 length;
+    u64 addr;
+
+    u64 segl_addr;
+    u64 segl_length;
+};
+
+enum bce_cmdq_command {
+    BCE_CMD_REGISTER_MEMORY_QUEUE = 0x20,
+    BCE_CMD_UNREGISTER_MEMORY_QUEUE = 0x30,
+    BCE_CMD_FLUSH_MEMORY_QUEUE = 0x40,
+    BCE_CMD_SET_MEMORY_QUEUE_PROPERTY = 0x50
+};
+struct bce_cmdq_simple_memory_queue_cmd {
+    u16 cmd; // bce_cmdq_command
+    u16 flags;
+    u16 qid;
+};
+struct bce_cmdq_register_memory_queue_cmd {
+    u16 cmd; // bce_cmdq_command
+    u16 flags;
+    u16 qid;
+    u16 _pad;
+    u16 el_count;
+    u16 vector_or_cq;
+    u16 _pad2;
+    u16 name_len;
+    char name[0x20];
+    u64 addr;
+    u64 length;
+};
+
+static __always_inline void *bce_sq_element(struct bce_queue_sq *q, int i) {
+    return (void *) ((u8 *) q->data + q->el_size * i);
+}
+static __always_inline void *bce_cq_element(struct bce_queue_cq *q, int i) {
+    return (void *) ((struct bce_qe_completion *) q->data + i);
+}
+
+static __always_inline struct bce_sq_completion_data *bce_next_completion(struct bce_queue_sq *sq) {
+    struct bce_sq_completion_data *res;
+    rmb();
+    if (sq->completion_cidx == sq->completion_tail)
+        return NULL;
+    res = &sq->completion_data[sq->completion_cidx];
+    sq->completion_cidx = (sq->completion_cidx + 1) % sq->el_count;
+    return res;
+}
+
+struct bce_queue_cq *bce_alloc_cq(struct apple_bce_device *dev, int qid, u32 el_count);
+void bce_get_cq_memcfg(struct bce_queue_cq *cq, struct bce_queue_memcfg *cfg);
+void bce_free_cq(struct apple_bce_device *dev, struct bce_queue_cq *cq);
+void bce_handle_cq_completions(struct apple_bce_device *dev, struct bce_queue_cq *cq);
+
+struct bce_queue_sq *bce_alloc_sq(struct apple_bce_device *dev, int qid, u32 el_size, u32 el_count,
+        bce_sq_completion compl, void *userdata);
+void bce_get_sq_memcfg(struct bce_queue_sq *sq, struct bce_queue_cq *cq, struct bce_queue_memcfg *cfg);
+void bce_free_sq(struct apple_bce_device *dev, struct bce_queue_sq *sq);
+int bce_reserve_submission(struct bce_queue_sq *sq, unsigned long *timeout);
+void bce_cancel_submission_reservation(struct bce_queue_sq *sq);
+void *bce_next_submission(struct bce_queue_sq *sq);
+void bce_submit_to_device(struct bce_queue_sq *sq);
+void bce_notify_submission_complete(struct bce_queue_sq *sq);
+
+void bce_set_submission_single(struct bce_qe_submission *element, dma_addr_t addr, size_t size);
+
+struct bce_queue_cmdq *bce_alloc_cmdq(struct apple_bce_device *dev, int qid, u32 el_count);
+void bce_free_cmdq(struct apple_bce_device *dev, struct bce_queue_cmdq *cmdq);
+
+u32 bce_cmd_register_queue(struct bce_queue_cmdq *cmdq, struct bce_queue_memcfg *cfg, const char *name, bool isdirout);
+u32 bce_cmd_unregister_memory_queue(struct bce_queue_cmdq *cmdq, u16 qid);
+u32 bce_cmd_flush_memory_queue(struct bce_queue_cmdq *cmdq, u16 qid);
+
+
+/* User API - Creates and registers the queue */
+
+struct bce_queue_cq *bce_create_cq(struct apple_bce_device *dev, u32 el_count);
+struct bce_queue_sq *bce_create_sq(struct apple_bce_device *dev, struct bce_queue_cq *cq, const char *name, u32 el_count,
+        int direction, bce_sq_completion compl, void *userdata);
+void bce_destroy_cq(struct apple_bce_device *dev, struct bce_queue_cq *cq);
+void bce_destroy_sq(struct apple_bce_device *dev, struct bce_queue_sq *sq);
+
+#endif //BCEDRIVER_MAILBOX_H
diff --git a/drivers/staging/apple-bce/queue_dma.c b/drivers/staging/apple-bce/queue_dma.c
new file mode 100644
index 000000000000..b236613285c0
--- /dev/null
+++ b/drivers/staging/apple-bce/queue_dma.c
@@ -0,0 +1,220 @@
+#include "queue_dma.h"
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include "queue.h"
+
+static int bce_alloc_scatterlist_from_vm(struct sg_table *tbl, void *data, size_t len);
+static struct bce_segment_list_element_hostinfo *bce_map_segment_list(
+        struct device *dev, struct scatterlist *pages, int pagen);
+static void bce_unmap_segement_list(struct device *dev, struct bce_segment_list_element_hostinfo *list);
+
+int bce_map_dma_buffer(struct device *dev, struct bce_dma_buffer *buf, struct sg_table scatterlist,
+        enum dma_data_direction dir)
+{
+    int cnt;
+
+    buf->direction = dir;
+    buf->scatterlist = scatterlist;
+    buf->seglist_hostinfo = NULL;
+
+    cnt = dma_map_sg(dev, buf->scatterlist.sgl, buf->scatterlist.nents, dir);
+    if (cnt != buf->scatterlist.nents) {
+        pr_err("apple-bce: DMA scatter list mapping returned an unexpected count: %i\n", cnt);
+        dma_unmap_sg(dev, buf->scatterlist.sgl, buf->scatterlist.nents, dir);
+        return -EIO;
+    }
+    if (cnt == 1)
+        return 0;
+
+    buf->seglist_hostinfo = bce_map_segment_list(dev, buf->scatterlist.sgl, buf->scatterlist.nents);
+    if (!buf->seglist_hostinfo) {
+        pr_err("apple-bce: Creating segment list failed\n");
+        dma_unmap_sg(dev, buf->scatterlist.sgl, buf->scatterlist.nents, dir);
+        return -EIO;
+    }
+    return 0;
+}
+
+int bce_map_dma_buffer_vm(struct device *dev, struct bce_dma_buffer *buf, void *data, size_t len,
+                          enum dma_data_direction dir)
+{
+    int status;
+    struct sg_table scatterlist;
+    if ((status = bce_alloc_scatterlist_from_vm(&scatterlist, data, len)))
+        return status;
+    if ((status = bce_map_dma_buffer(dev, buf, scatterlist, dir))) {
+        sg_free_table(&scatterlist);
+        return status;
+    }
+    return 0;
+}
+
+int bce_map_dma_buffer_km(struct device *dev, struct bce_dma_buffer *buf, void *data, size_t len,
+                          enum dma_data_direction dir)
+{
+    /* Kernel memory is continuous which is great for us. */
+    int status;
+    struct sg_table scatterlist;
+    if ((status = sg_alloc_table(&scatterlist, 1, GFP_KERNEL))) {
+        sg_free_table(&scatterlist);
+        return status;
+    }
+    sg_set_buf(scatterlist.sgl, data, (uint) len);
+    if ((status = bce_map_dma_buffer(dev, buf, scatterlist, dir))) {
+        sg_free_table(&scatterlist);
+        return status;
+    }
+    return 0;
+}
+
+void bce_unmap_dma_buffer(struct device *dev, struct bce_dma_buffer *buf)
+{
+    dma_unmap_sg(dev, buf->scatterlist.sgl, buf->scatterlist.nents, buf->direction);
+    bce_unmap_segement_list(dev, buf->seglist_hostinfo);
+}
+
+
+static int bce_alloc_scatterlist_from_vm(struct sg_table *tbl, void *data, size_t len)
+{
+    int status, i;
+    struct page **pages;
+    size_t off, start_page, end_page, page_count;
+    off        = (size_t) data % PAGE_SIZE;
+    start_page = (size_t) data  / PAGE_SIZE;
+    end_page   = ((size_t) data + len - 1) / PAGE_SIZE;
+    page_count = end_page - start_page + 1;
+
+    if (page_count > PAGE_SIZE / sizeof(struct page *))
+        pages = vmalloc(page_count * sizeof(struct page *));
+    else
+        pages = kmalloc(page_count * sizeof(struct page *), GFP_KERNEL);
+
+    for (i = 0; i < page_count; i++)
+        pages[i] = vmalloc_to_page((void *) ((start_page + i) * PAGE_SIZE));
+
+    if ((status = sg_alloc_table_from_pages(tbl, pages, page_count, (unsigned int) off, len, GFP_KERNEL))) {
+        sg_free_table(tbl);
+    }
+
+    if (page_count > PAGE_SIZE / sizeof(struct page *))
+        vfree(pages);
+    else
+        kfree(pages);
+    return status;
+}
+
+#define BCE_ELEMENTS_PER_PAGE ((PAGE_SIZE - sizeof(struct bce_segment_list_header)) \
+                               / sizeof(struct bce_segment_list_element))
+#define BCE_ELEMENTS_PER_ADDITIONAL_PAGE (PAGE_SIZE / sizeof(struct bce_segment_list_element))
+
+static struct bce_segment_list_element_hostinfo *bce_map_segment_list(
+        struct device *dev, struct scatterlist *pages, int pagen)
+{
+    size_t ptr, pptr = 0;
+    struct bce_segment_list_header theader; /* a temp header, to store the initial seg */
+    struct bce_segment_list_header *header;
+    struct bce_segment_list_element *el, *el_end;
+    struct bce_segment_list_element_hostinfo *out, *pout, *out_root;
+    struct scatterlist *sg;
+    int i;
+    header = &theader;
+    out = out_root = NULL;
+    el = el_end = NULL;
+    for_each_sg(pages, sg, pagen, i) {
+        if (el >= el_end) {
+            /* allocate a new page, this will be also done for the first element */
+            ptr = __get_free_page(GFP_KERNEL);
+            if (pptr && ptr == pptr + PAGE_SIZE) {
+                out->page_count++;
+                header->element_count += BCE_ELEMENTS_PER_ADDITIONAL_PAGE;
+                el_end += BCE_ELEMENTS_PER_ADDITIONAL_PAGE;
+            } else {
+                header = (void *) ptr;
+                header->element_count = BCE_ELEMENTS_PER_PAGE;
+                header->data_size = 0;
+                header->next_segl_addr = 0;
+                header->next_segl_length = 0;
+                el = (void *) (header + 1);
+                el_end = el + BCE_ELEMENTS_PER_PAGE;
+
+                if (out) {
+                    out->next = kmalloc(sizeof(struct bce_segment_list_element_hostinfo), GFP_KERNEL);
+                    out = out->next;
+                } else {
+                    out_root = out = kmalloc(sizeof(struct bce_segment_list_element_hostinfo), GFP_KERNEL);
+                }
+                out->page_start = (void *) ptr;
+                out->page_count = 1;
+                out->dma_start = DMA_MAPPING_ERROR;
+                out->next = NULL;
+            }
+            pptr = ptr;
+        }
+        el->addr = sg->dma_address;
+        el->length = sg->length;
+        header->data_size += el->length;
+    }
+
+    /* DMA map */
+    out = out_root;
+    pout = NULL;
+    while (out) {
+        out->dma_start = dma_map_single(dev, out->page_start, out->page_count * PAGE_SIZE, DMA_TO_DEVICE);
+        if (dma_mapping_error(dev, out->dma_start))
+            goto error;
+        if (pout) {
+            header = pout->page_start;
+            header->next_segl_addr = out->dma_start;
+            header->next_segl_length = out->page_count * PAGE_SIZE;
+        }
+        pout = out;
+        out = out->next;
+    }
+    return out_root;
+
+    error:
+    bce_unmap_segement_list(dev, out_root);
+    return NULL;
+}
+
+static void bce_unmap_segement_list(struct device *dev, struct bce_segment_list_element_hostinfo *list)
+{
+    struct bce_segment_list_element_hostinfo *next;
+    while (list) {
+        if (list->dma_start != DMA_MAPPING_ERROR)
+            dma_unmap_single(dev, list->dma_start, list->page_count * PAGE_SIZE, DMA_TO_DEVICE);
+        next = list->next;
+        kfree(list);
+        list = next;
+    }
+}
+
+int bce_set_submission_buf(struct bce_qe_submission *element, struct bce_dma_buffer *buf, size_t offset, size_t length)
+{
+    struct bce_segment_list_element_hostinfo *seg;
+    struct bce_segment_list_header *seg_header;
+
+    seg = buf->seglist_hostinfo;
+    if (!seg) {
+        element->addr = buf->scatterlist.sgl->dma_address + offset;
+        element->length = length;
+        element->segl_addr = 0;
+        element->segl_length = 0;
+        return 0;
+    }
+
+    while (seg) {
+        seg_header = seg->page_start;
+        if (offset <= seg_header->data_size)
+            break;
+        offset -= seg_header->data_size;
+        seg = seg->next;
+    }
+    if (!seg)
+        return -EINVAL;
+    element->addr = offset;
+    element->length = buf->scatterlist.sgl->dma_length;
+    element->segl_addr = seg->dma_start;
+    element->segl_length = seg->page_count * PAGE_SIZE;
+    return 0;
+}
\ No newline at end of file
diff --git a/drivers/staging/apple-bce/queue_dma.h b/drivers/staging/apple-bce/queue_dma.h
new file mode 100644
index 000000000000..f8a57e50e7a3
--- /dev/null
+++ b/drivers/staging/apple-bce/queue_dma.h
@@ -0,0 +1,50 @@
+#ifndef BCE_QUEUE_DMA_H
+#define BCE_QUEUE_DMA_H
+
+#include <linux/pci.h>
+
+struct bce_qe_submission;
+
+struct bce_segment_list_header {
+    u64 element_count;
+    u64 data_size;
+
+    u64 next_segl_addr;
+    u64 next_segl_length;
+};
+struct bce_segment_list_element {
+    u64 addr;
+    u64 length;
+};
+
+struct bce_segment_list_element_hostinfo {
+    struct bce_segment_list_element_hostinfo *next;
+    void *page_start;
+    size_t page_count;
+    dma_addr_t dma_start;
+};
+
+
+struct bce_dma_buffer {
+    enum dma_data_direction direction;
+    struct sg_table scatterlist;
+    struct bce_segment_list_element_hostinfo *seglist_hostinfo;
+};
+
+/* NOTE: Takes ownership of the sg_table if it succeeds. Ownership is not transferred on failure. */
+int bce_map_dma_buffer(struct device *dev, struct bce_dma_buffer *buf, struct sg_table scatterlist,
+        enum dma_data_direction dir);
+
+/* Creates a buffer from virtual memory (vmalloc) */
+int bce_map_dma_buffer_vm(struct device *dev, struct bce_dma_buffer *buf, void *data, size_t len,
+        enum dma_data_direction dir);
+
+/* Creates a buffer from kernel memory (kmalloc) */
+int bce_map_dma_buffer_km(struct device *dev, struct bce_dma_buffer *buf, void *data, size_t len,
+                          enum dma_data_direction dir);
+
+void bce_unmap_dma_buffer(struct device *dev, struct bce_dma_buffer *buf);
+
+int bce_set_submission_buf(struct bce_qe_submission *element, struct bce_dma_buffer *buf, size_t offset, size_t length);
+
+#endif //BCE_QUEUE_DMA_H
diff --git a/drivers/staging/apple-bce/vhci/command.h b/drivers/staging/apple-bce/vhci/command.h
new file mode 100644
index 000000000000..26619e0bccfa
--- /dev/null
+++ b/drivers/staging/apple-bce/vhci/command.h
@@ -0,0 +1,204 @@
+#ifndef BCE_VHCI_COMMAND_H
+#define BCE_VHCI_COMMAND_H
+
+#include "queue.h"
+#include <linux/jiffies.h>
+#include <linux/usb.h>
+
+#define BCE_VHCI_CMD_TIMEOUT_SHORT msecs_to_jiffies(2000)
+#define BCE_VHCI_CMD_TIMEOUT_LONG msecs_to_jiffies(30000)
+
+#define BCE_VHCI_BULK_MAX_ACTIVE_URBS_POW2 2
+#define BCE_VHCI_BULK_MAX_ACTIVE_URBS (1 << BCE_VHCI_BULK_MAX_ACTIVE_URBS_POW2)
+
+typedef u8 bce_vhci_port_t;
+typedef u8 bce_vhci_device_t;
+
+enum bce_vhci_command {
+    BCE_VHCI_CMD_CONTROLLER_ENABLE = 1,
+    BCE_VHCI_CMD_CONTROLLER_DISABLE = 2,
+    BCE_VHCI_CMD_CONTROLLER_START = 3,
+    BCE_VHCI_CMD_CONTROLLER_PAUSE = 4,
+
+    BCE_VHCI_CMD_PORT_POWER_ON = 0x10,
+    BCE_VHCI_CMD_PORT_POWER_OFF = 0x11,
+    BCE_VHCI_CMD_PORT_RESUME = 0x12,
+    BCE_VHCI_CMD_PORT_SUSPEND = 0x13,
+    BCE_VHCI_CMD_PORT_RESET = 0x14,
+    BCE_VHCI_CMD_PORT_DISABLE = 0x15,
+    BCE_VHCI_CMD_PORT_STATUS = 0x16,
+
+    BCE_VHCI_CMD_DEVICE_CREATE = 0x30,
+    BCE_VHCI_CMD_DEVICE_DESTROY = 0x31,
+
+    BCE_VHCI_CMD_ENDPOINT_CREATE = 0x40,
+    BCE_VHCI_CMD_ENDPOINT_DESTROY = 0x41,
+    BCE_VHCI_CMD_ENDPOINT_SET_STATE = 0x42,
+    BCE_VHCI_CMD_ENDPOINT_RESET = 0x44,
+
+    /* Device to host only */
+    BCE_VHCI_CMD_ENDPOINT_REQUEST_STATE = 0x43,
+    BCE_VHCI_CMD_TRANSFER_REQUEST = 0x1000,
+    BCE_VHCI_CMD_CONTROL_TRANSFER_STATUS = 0x1005
+};
+
+enum bce_vhci_endpoint_state {
+    BCE_VHCI_ENDPOINT_ACTIVE = 0,
+    BCE_VHCI_ENDPOINT_PAUSED = 1,
+    BCE_VHCI_ENDPOINT_STALLED = 2
+};
+
+static inline int bce_vhci_cmd_controller_enable(struct bce_vhci_command_queue *q, u8 busNum, u16 *portMask)
+{
+    int status;
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_CONTROLLER_ENABLE;
+    cmd.param1 = 0x7100u | busNum;
+    status = bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_LONG);
+    if (!status)
+        *portMask = (u16) res.param2;
+    return status;
+}
+static inline int bce_vhci_cmd_controller_disable(struct bce_vhci_command_queue *q)
+{
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_CONTROLLER_DISABLE;
+    return bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_LONG);
+}
+static inline int bce_vhci_cmd_controller_start(struct bce_vhci_command_queue *q)
+{
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_CONTROLLER_START;
+    return bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_LONG);
+}
+static inline int bce_vhci_cmd_controller_pause(struct bce_vhci_command_queue *q)
+{
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_CONTROLLER_PAUSE;
+    return bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_LONG);
+}
+
+static inline int bce_vhci_cmd_port_power_on(struct bce_vhci_command_queue *q, bce_vhci_port_t port)
+{
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_PORT_POWER_ON;
+    cmd.param1 = port;
+    return bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_SHORT);
+}
+static inline int bce_vhci_cmd_port_power_off(struct bce_vhci_command_queue *q, bce_vhci_port_t port)
+{
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_PORT_POWER_OFF;
+    cmd.param1 = port;
+    return bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_SHORT);
+}
+static inline int bce_vhci_cmd_port_resume(struct bce_vhci_command_queue *q, bce_vhci_port_t port)
+{
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_PORT_RESUME;
+    cmd.param1 = port;
+    return bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_LONG);
+}
+static inline int bce_vhci_cmd_port_suspend(struct bce_vhci_command_queue *q, bce_vhci_port_t port)
+{
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_PORT_SUSPEND;
+    cmd.param1 = port;
+    return bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_LONG);
+}
+static inline int bce_vhci_cmd_port_reset(struct bce_vhci_command_queue *q, bce_vhci_port_t port, u32 timeout)
+{
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_PORT_RESET;
+    cmd.param1 = port;
+    cmd.param2 = timeout;
+    return bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_SHORT);
+}
+static inline int bce_vhci_cmd_port_disable(struct bce_vhci_command_queue *q, bce_vhci_port_t port)
+{
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_PORT_DISABLE;
+    cmd.param1 = port;
+    return bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_SHORT);
+}
+static inline int bce_vhci_cmd_port_status(struct bce_vhci_command_queue *q, bce_vhci_port_t port,
+        u32 clearFlags, u32 *resStatus)
+{
+    int status;
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_PORT_STATUS;
+    cmd.param1 = port;
+    cmd.param2 = clearFlags & 0x560000;
+    status = bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_SHORT);
+    if (status >= 0)
+        *resStatus = (u32) res.param2;
+    return status;
+}
+
+static inline int bce_vhci_cmd_device_create(struct bce_vhci_command_queue *q, bce_vhci_port_t port,
+        bce_vhci_device_t *dev)
+{
+    int status;
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_DEVICE_CREATE;
+    cmd.param1 = port;
+    status = bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_SHORT);
+    if (!status)
+        *dev = (bce_vhci_device_t) res.param2;
+    return status;
+}
+static inline int bce_vhci_cmd_device_destroy(struct bce_vhci_command_queue *q, bce_vhci_device_t dev)
+{
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_DEVICE_DESTROY;
+    cmd.param1 = dev;
+    return bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_LONG);
+}
+
+static inline int bce_vhci_cmd_endpoint_create(struct bce_vhci_command_queue *q, bce_vhci_device_t dev,
+        struct usb_endpoint_descriptor *desc)
+{
+    struct bce_vhci_message cmd, res;
+    int endpoint_type = usb_endpoint_type(desc);
+    int maxp = usb_endpoint_maxp(desc);
+    int maxp_burst = usb_endpoint_maxp_mult(desc) * maxp;
+    u8 max_active_requests_pow2 = 0;
+    cmd.cmd = BCE_VHCI_CMD_ENDPOINT_CREATE;
+    cmd.param1 = dev | ((desc->bEndpointAddress & 0x8Fu) << 8);
+    if (endpoint_type == USB_ENDPOINT_XFER_BULK)
+        max_active_requests_pow2 = BCE_VHCI_BULK_MAX_ACTIVE_URBS_POW2;
+    cmd.param2 = endpoint_type | ((max_active_requests_pow2 & 0xf) << 4) | (maxp << 16) | ((u64) maxp_burst << 32);
+    if (endpoint_type == USB_ENDPOINT_XFER_INT)
+        cmd.param2 |= (desc->bInterval - 1) << 8;
+    return bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_SHORT);
+}
+static inline int bce_vhci_cmd_endpoint_destroy(struct bce_vhci_command_queue *q, bce_vhci_device_t dev, u8 endpoint)
+{
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_ENDPOINT_DESTROY;
+    cmd.param1 = dev | (endpoint << 8);
+    return bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_SHORT);
+}
+static inline int bce_vhci_cmd_endpoint_set_state(struct bce_vhci_command_queue *q, bce_vhci_device_t dev, u8 endpoint,
+        enum bce_vhci_endpoint_state newState, enum bce_vhci_endpoint_state *retState)
+{
+    int status;
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_ENDPOINT_SET_STATE;
+    cmd.param1 = dev | (endpoint << 8);
+    cmd.param2 = (u64) newState;
+    status = bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_SHORT);
+    if (status != BCE_VHCI_INTERNAL_ERROR && status != BCE_VHCI_NO_POWER)
+        *retState = (enum bce_vhci_endpoint_state) res.param2;
+    return status;
+}
+static inline int bce_vhci_cmd_endpoint_reset(struct bce_vhci_command_queue *q, bce_vhci_device_t dev, u8 endpoint)
+{
+    struct bce_vhci_message cmd, res;
+    cmd.cmd = BCE_VHCI_CMD_ENDPOINT_RESET;
+    cmd.param1 = dev | (endpoint << 8);
+    return bce_vhci_command_queue_execute(q, &cmd, &res, BCE_VHCI_CMD_TIMEOUT_SHORT);
+}
+
+
+#endif //BCE_VHCI_COMMAND_H
diff --git a/drivers/staging/apple-bce/vhci/queue.c b/drivers/staging/apple-bce/vhci/queue.c
new file mode 100644
index 000000000000..7b0b5027157b
--- /dev/null
+++ b/drivers/staging/apple-bce/vhci/queue.c
@@ -0,0 +1,268 @@
+#include "queue.h"
+#include "vhci.h"
+#include "../apple_bce.h"
+
+
+static void bce_vhci_message_queue_completion(struct bce_queue_sq *sq);
+
+int bce_vhci_message_queue_create(struct bce_vhci *vhci, struct bce_vhci_message_queue *ret, const char *name)
+{
+    int status;
+    ret->cq = bce_create_cq(vhci->dev, VHCI_EVENT_QUEUE_EL_COUNT);
+    if (!ret->cq)
+        return -EINVAL;
+    ret->sq = bce_create_sq(vhci->dev, ret->cq, name, VHCI_EVENT_QUEUE_EL_COUNT, DMA_TO_DEVICE,
+                            bce_vhci_message_queue_completion, ret);
+    if (!ret->sq) {
+        status = -EINVAL;
+        goto fail_cq;
+    }
+    ret->data = dma_alloc_coherent(&vhci->dev->pci->dev, sizeof(struct bce_vhci_message) * VHCI_EVENT_QUEUE_EL_COUNT,
+                                   &ret->dma_addr, GFP_KERNEL);
+    if (!ret->data) {
+        status = -EINVAL;
+        goto fail_sq;
+    }
+    return 0;
+
+fail_sq:
+    bce_destroy_sq(vhci->dev, ret->sq);
+    ret->sq = NULL;
+fail_cq:
+    bce_destroy_cq(vhci->dev, ret->cq);
+    ret->cq = NULL;
+    return status;
+}
+
+void bce_vhci_message_queue_destroy(struct bce_vhci *vhci, struct bce_vhci_message_queue *q)
+{
+    if (!q->cq)
+        return;
+    dma_free_coherent(&vhci->dev->pci->dev, sizeof(struct bce_vhci_message) * VHCI_EVENT_QUEUE_EL_COUNT,
+                      q->data, q->dma_addr);
+    bce_destroy_sq(vhci->dev, q->sq);
+    bce_destroy_cq(vhci->dev, q->cq);
+}
+
+void bce_vhci_message_queue_write(struct bce_vhci_message_queue *q, struct bce_vhci_message *req)
+{
+    int sidx;
+    struct bce_qe_submission *s;
+    sidx = q->sq->tail;
+    s = bce_next_submission(q->sq);
+    pr_debug("bce-vhci: Send message: %x s=%x p1=%x p2=%llx\n", req->cmd, req->status, req->param1, req->param2);
+    q->data[sidx] = *req;
+    bce_set_submission_single(s, q->dma_addr + sizeof(struct bce_vhci_message) * sidx,
+            sizeof(struct bce_vhci_message));
+    bce_submit_to_device(q->sq);
+}
+
+static void bce_vhci_message_queue_completion(struct bce_queue_sq *sq)
+{
+    while (bce_next_completion(sq))
+        bce_notify_submission_complete(sq);
+}
+
+
+
+static void bce_vhci_event_queue_completion(struct bce_queue_sq *sq);
+
+int __bce_vhci_event_queue_create(struct bce_vhci *vhci, struct bce_vhci_event_queue *ret, const char *name,
+                                  bce_sq_completion compl)
+{
+    ret->vhci = vhci;
+
+    ret->sq = bce_create_sq(vhci->dev, vhci->ev_cq, name, VHCI_EVENT_QUEUE_EL_COUNT, DMA_FROM_DEVICE, compl, ret);
+    if (!ret->sq)
+        return -EINVAL;
+    ret->data = dma_alloc_coherent(&vhci->dev->pci->dev, sizeof(struct bce_vhci_message) * VHCI_EVENT_QUEUE_EL_COUNT,
+                                   &ret->dma_addr, GFP_KERNEL);
+    if (!ret->data) {
+        bce_destroy_sq(vhci->dev, ret->sq);
+        ret->sq = NULL;
+        return -EINVAL;
+    }
+
+    init_completion(&ret->queue_empty_completion);
+    bce_vhci_event_queue_submit_pending(ret, VHCI_EVENT_PENDING_COUNT);
+    return 0;
+}
+
+int bce_vhci_event_queue_create(struct bce_vhci *vhci, struct bce_vhci_event_queue *ret, const char *name,
+        bce_vhci_event_queue_callback cb)
+{
+    ret->cb = cb;
+    return __bce_vhci_event_queue_create(vhci, ret, name, bce_vhci_event_queue_completion);
+}
+
+void bce_vhci_event_queue_destroy(struct bce_vhci *vhci, struct bce_vhci_event_queue *q)
+{
+    if (!q->sq)
+        return;
+    dma_free_coherent(&vhci->dev->pci->dev, sizeof(struct bce_vhci_message) * VHCI_EVENT_QUEUE_EL_COUNT,
+                      q->data, q->dma_addr);
+    bce_destroy_sq(vhci->dev, q->sq);
+}
+
+static void bce_vhci_event_queue_completion(struct bce_queue_sq *sq)
+{
+    struct bce_sq_completion_data *cd;
+    struct bce_vhci_event_queue *ev = sq->userdata;
+    struct bce_vhci_message *msg;
+    size_t cnt = 0;
+
+    while ((cd = bce_next_completion(sq))) {
+        if (cd->status == BCE_COMPLETION_ABORTED) { /* We flushed the queue */
+            bce_notify_submission_complete(sq);
+            continue;
+        }
+        msg = &ev->data[sq->head];
+        pr_debug("bce-vhci: Got event: %x s=%x p1=%x p2=%llx\n", msg->cmd, msg->status, msg->param1, msg->param2);
+        ev->cb(ev, msg);
+
+        bce_notify_submission_complete(sq);
+        ++cnt;
+    }
+    bce_vhci_event_queue_submit_pending(ev, cnt);
+    if (atomic_read(&sq->available_commands) == sq->el_count - 1)
+        complete(&ev->queue_empty_completion);
+}
+
+void bce_vhci_event_queue_submit_pending(struct bce_vhci_event_queue *q, size_t count)
+{
+    int idx;
+    struct bce_qe_submission *s;
+    while (count--) {
+        if (bce_reserve_submission(q->sq, NULL)) {
+            pr_err("bce-vhci: Failed to reserve an event queue submission\n");
+            break;
+        }
+        idx = q->sq->tail;
+        s = bce_next_submission(q->sq);
+        bce_set_submission_single(s,
+                                  q->dma_addr + idx * sizeof(struct bce_vhci_message), sizeof(struct bce_vhci_message));
+    }
+    bce_submit_to_device(q->sq);
+}
+
+void bce_vhci_event_queue_pause(struct bce_vhci_event_queue *q)
+{
+    unsigned long timeout;
+    reinit_completion(&q->queue_empty_completion);
+    if (bce_cmd_flush_memory_queue(q->vhci->dev->cmd_cmdq, q->sq->qid))
+        pr_warn("bce-vhci: failed to flush event queue\n");
+    timeout = msecs_to_jiffies(5000);
+    while (atomic_read(&q->sq->available_commands) != q->sq->el_count - 1) {
+        timeout = wait_for_completion_timeout(&q->queue_empty_completion, timeout);
+        if (timeout == 0) {
+            pr_err("bce-vhci: waiting for queue to be flushed timed out\n");
+            break;
+        }
+    }
+}
+
+void bce_vhci_event_queue_resume(struct bce_vhci_event_queue *q)
+{
+    if (atomic_read(&q->sq->available_commands) != q->sq->el_count - 1) {
+        pr_err("bce-vhci: resume of a queue with pending submissions\n");
+        return;
+    }
+    bce_vhci_event_queue_submit_pending(q, VHCI_EVENT_PENDING_COUNT);
+}
+
+void bce_vhci_command_queue_create(struct bce_vhci_command_queue *ret, struct bce_vhci_message_queue *mq)
+{
+    ret->mq = mq;
+    ret->completion.result = NULL;
+    init_completion(&ret->completion.completion);
+    spin_lock_init(&ret->completion_lock);
+    mutex_init(&ret->mutex);
+}
+
+void bce_vhci_command_queue_destroy(struct bce_vhci_command_queue *cq)
+{
+    spin_lock(&cq->completion_lock);
+    if (cq->completion.result) {
+        memset(cq->completion.result, 0, sizeof(struct bce_vhci_message));
+        cq->completion.result->status = BCE_VHCI_ABORT;
+        complete(&cq->completion.completion);
+        cq->completion.result = NULL;
+    }
+    spin_unlock(&cq->completion_lock);
+    mutex_lock(&cq->mutex);
+    mutex_unlock(&cq->mutex);
+    mutex_destroy(&cq->mutex);
+}
+
+void bce_vhci_command_queue_deliver_completion(struct bce_vhci_command_queue *cq, struct bce_vhci_message *msg)
+{
+    struct bce_vhci_command_queue_completion *c = &cq->completion;
+
+    spin_lock(&cq->completion_lock);
+    if (c->result) {
+        *c->result = *msg;
+        complete(&c->completion);
+        c->result = NULL;
+    }
+    spin_unlock(&cq->completion_lock);
+}
+
+static int __bce_vhci_command_queue_execute(struct bce_vhci_command_queue *cq, struct bce_vhci_message *req,
+        struct bce_vhci_message *res, unsigned long timeout)
+{
+    int status;
+    struct bce_vhci_command_queue_completion *c;
+    struct bce_vhci_message creq;
+    c = &cq->completion;
+
+    if ((status = bce_reserve_submission(cq->mq->sq, &timeout)))
+        return status;
+
+    spin_lock(&cq->completion_lock);
+    c->result = res;
+    reinit_completion(&c->completion);
+    spin_unlock(&cq->completion_lock);
+
+    bce_vhci_message_queue_write(cq->mq, req);
+
+    if (!wait_for_completion_timeout(&c->completion, timeout)) {
+        /* we ran out of time, send cancellation */
+        pr_debug("bce-vhci: command timed out req=%x\n", req->cmd);
+        if ((status = bce_reserve_submission(cq->mq->sq, &timeout)))
+            return status;
+
+        creq = *req;
+        creq.cmd |= 0x4000;
+        bce_vhci_message_queue_write(cq->mq, &creq);
+
+        if (!wait_for_completion_timeout(&c->completion, 1000)) {
+            pr_err("bce-vhci: Possible desync, cmd cancel timed out\n");
+
+            spin_lock(&cq->completion_lock);
+            c->result = NULL;
+            spin_unlock(&cq->completion_lock);
+            return -ETIMEDOUT;
+        }
+        if ((res->cmd & ~0x8000) == creq.cmd)
+            return -ETIMEDOUT;
+        /* reply for the previous command most likely arrived */
+    }
+
+    if ((res->cmd & ~0x8000) != req->cmd) {
+        pr_err("bce-vhci: Possible desync, cmd reply mismatch req=%x, res=%x\n", req->cmd, res->cmd);
+        return -EIO;
+    }
+    if (res->status == BCE_VHCI_SUCCESS)
+        return 0;
+    return res->status;
+}
+
+int bce_vhci_command_queue_execute(struct bce_vhci_command_queue *cq, struct bce_vhci_message *req,
+                                   struct bce_vhci_message *res, unsigned long timeout)
+{
+    int status;
+    mutex_lock(&cq->mutex);
+    status = __bce_vhci_command_queue_execute(cq, req, res, timeout);
+    mutex_unlock(&cq->mutex);
+    return status;
+}
diff --git a/drivers/staging/apple-bce/vhci/queue.h b/drivers/staging/apple-bce/vhci/queue.h
new file mode 100644
index 000000000000..adb705b6ba1d
--- /dev/null
+++ b/drivers/staging/apple-bce/vhci/queue.h
@@ -0,0 +1,76 @@
+#ifndef BCE_VHCI_QUEUE_H
+#define BCE_VHCI_QUEUE_H
+
+#include <linux/completion.h>
+#include "../queue.h"
+
+#define VHCI_EVENT_QUEUE_EL_COUNT 256
+#define VHCI_EVENT_PENDING_COUNT 32
+
+struct bce_vhci;
+struct bce_vhci_event_queue;
+
+enum bce_vhci_message_status {
+    BCE_VHCI_SUCCESS = 1,
+    BCE_VHCI_ERROR = 2,
+    BCE_VHCI_USB_PIPE_STALL = 3,
+    BCE_VHCI_ABORT = 4,
+    BCE_VHCI_BAD_ARGUMENT = 5,
+    BCE_VHCI_OVERRUN = 6,
+    BCE_VHCI_INTERNAL_ERROR = 7,
+    BCE_VHCI_NO_POWER = 8,
+    BCE_VHCI_UNSUPPORTED = 9
+};
+struct bce_vhci_message {
+    u16 cmd;
+    u16 status; // bce_vhci_message_status
+    u32 param1;
+    u64 param2;
+};
+
+struct bce_vhci_message_queue {
+    struct bce_queue_cq *cq;
+    struct bce_queue_sq *sq;
+    struct bce_vhci_message *data;
+    dma_addr_t dma_addr;
+};
+typedef void (*bce_vhci_event_queue_callback)(struct bce_vhci_event_queue *q, struct bce_vhci_message *msg);
+struct bce_vhci_event_queue {
+    struct bce_vhci *vhci;
+    struct bce_queue_sq *sq;
+    struct bce_vhci_message *data;
+    dma_addr_t dma_addr;
+    bce_vhci_event_queue_callback cb;
+    struct completion queue_empty_completion;
+};
+struct bce_vhci_command_queue_completion {
+    struct bce_vhci_message *result;
+    struct completion completion;
+};
+struct bce_vhci_command_queue {
+    struct bce_vhci_message_queue *mq;
+    struct bce_vhci_command_queue_completion completion;
+    struct spinlock completion_lock;
+    struct mutex mutex;
+};
+
+int bce_vhci_message_queue_create(struct bce_vhci *vhci, struct bce_vhci_message_queue *ret, const char *name);
+void bce_vhci_message_queue_destroy(struct bce_vhci *vhci, struct bce_vhci_message_queue *q);
+void bce_vhci_message_queue_write(struct bce_vhci_message_queue *q, struct bce_vhci_message *req);
+
+int __bce_vhci_event_queue_create(struct bce_vhci *vhci, struct bce_vhci_event_queue *ret, const char *name,
+        bce_sq_completion compl);
+int bce_vhci_event_queue_create(struct bce_vhci *vhci, struct bce_vhci_event_queue *ret, const char *name,
+        bce_vhci_event_queue_callback cb);
+void bce_vhci_event_queue_destroy(struct bce_vhci *vhci, struct bce_vhci_event_queue *q);
+void bce_vhci_event_queue_submit_pending(struct bce_vhci_event_queue *q, size_t count);
+void bce_vhci_event_queue_pause(struct bce_vhci_event_queue *q);
+void bce_vhci_event_queue_resume(struct bce_vhci_event_queue *q);
+
+void bce_vhci_command_queue_create(struct bce_vhci_command_queue *ret, struct bce_vhci_message_queue *mq);
+void bce_vhci_command_queue_destroy(struct bce_vhci_command_queue *cq);
+int bce_vhci_command_queue_execute(struct bce_vhci_command_queue *cq, struct bce_vhci_message *req,
+        struct bce_vhci_message *res, unsigned long timeout);
+void bce_vhci_command_queue_deliver_completion(struct bce_vhci_command_queue *cq, struct bce_vhci_message *msg);
+
+#endif //BCE_VHCI_QUEUE_H
diff --git a/drivers/staging/apple-bce/vhci/transfer.c b/drivers/staging/apple-bce/vhci/transfer.c
new file mode 100644
index 000000000000..8226363d69c8
--- /dev/null
+++ b/drivers/staging/apple-bce/vhci/transfer.c
@@ -0,0 +1,661 @@
+#include "transfer.h"
+#include "../queue.h"
+#include "vhci.h"
+#include "../apple_bce.h"
+#include <linux/usb/hcd.h>
+
+static void bce_vhci_transfer_queue_completion(struct bce_queue_sq *sq);
+static void bce_vhci_transfer_queue_giveback(struct bce_vhci_transfer_queue *q);
+static void bce_vhci_transfer_queue_remove_pending(struct bce_vhci_transfer_queue *q);
+
+static int bce_vhci_urb_init(struct bce_vhci_urb *vurb);
+static int bce_vhci_urb_update(struct bce_vhci_urb *urb, struct bce_vhci_message *msg);
+static int bce_vhci_urb_transfer_completion(struct bce_vhci_urb *urb, struct bce_sq_completion_data *c);
+
+static void bce_vhci_transfer_queue_reset_w(struct work_struct *work);
+
+void bce_vhci_create_transfer_queue(struct bce_vhci *vhci, struct bce_vhci_transfer_queue *q,
+        struct usb_host_endpoint *endp, bce_vhci_device_t dev_addr, enum dma_data_direction dir)
+{
+    char name[0x21];
+    INIT_LIST_HEAD(&q->evq);
+    INIT_LIST_HEAD(&q->giveback_urb_list);
+    spin_lock_init(&q->urb_lock);
+    mutex_init(&q->pause_lock);
+    q->vhci = vhci;
+    q->endp = endp;
+    q->dev_addr = dev_addr;
+    q->endp_addr = (u8) (endp->desc.bEndpointAddress & 0x8F);
+    q->state = BCE_VHCI_ENDPOINT_ACTIVE;
+    q->active = true;
+    q->stalled = false;
+    q->max_active_requests = 1;
+    if (usb_endpoint_type(&endp->desc) == USB_ENDPOINT_XFER_BULK)
+        q->max_active_requests = BCE_VHCI_BULK_MAX_ACTIVE_URBS;
+    q->remaining_active_requests = q->max_active_requests;
+    q->cq = bce_create_cq(vhci->dev, 0x100);
+    INIT_WORK(&q->w_reset, bce_vhci_transfer_queue_reset_w);
+    q->sq_in = NULL;
+    if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) {
+        snprintf(name, sizeof(name), "VHC1-%i-%02x", dev_addr, 0x80 | usb_endpoint_num(&endp->desc));
+        q->sq_in = bce_create_sq(vhci->dev, q->cq, name, 0x100, DMA_FROM_DEVICE,
+                                 bce_vhci_transfer_queue_completion, q);
+    }
+    q->sq_out = NULL;
+    if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) {
+        snprintf(name, sizeof(name), "VHC1-%i-%02x", dev_addr, usb_endpoint_num(&endp->desc));
+        q->sq_out = bce_create_sq(vhci->dev, q->cq, name, 0x100, DMA_TO_DEVICE,
+                                  bce_vhci_transfer_queue_completion, q);
+    }
+}
+
+void bce_vhci_destroy_transfer_queue(struct bce_vhci *vhci, struct bce_vhci_transfer_queue *q)
+{
+    bce_vhci_transfer_queue_giveback(q);
+    bce_vhci_transfer_queue_remove_pending(q);
+    if (q->sq_in)
+        bce_destroy_sq(vhci->dev, q->sq_in);
+    if (q->sq_out)
+        bce_destroy_sq(vhci->dev, q->sq_out);
+    bce_destroy_cq(vhci->dev, q->cq);
+}
+
+static inline bool bce_vhci_transfer_queue_can_init_urb(struct bce_vhci_transfer_queue *q)
+{
+    return q->remaining_active_requests > 0;
+}
+
+static void bce_vhci_transfer_queue_defer_event(struct bce_vhci_transfer_queue *q, struct bce_vhci_message *msg)
+{
+    struct bce_vhci_list_message *lm;
+    lm = kmalloc(sizeof(struct bce_vhci_list_message), GFP_KERNEL);
+    INIT_LIST_HEAD(&lm->list);
+    lm->msg = *msg;
+    list_add_tail(&lm->list, &q->evq);
+}
+
+static void bce_vhci_transfer_queue_giveback(struct bce_vhci_transfer_queue *q)
+{
+    unsigned long flags;
+    struct urb *urb;
+    spin_lock_irqsave(&q->urb_lock, flags);
+    while (!list_empty(&q->giveback_urb_list)) {
+        urb = list_first_entry(&q->giveback_urb_list, struct urb, urb_list);
+        list_del(&urb->urb_list);
+
+        spin_unlock_irqrestore(&q->urb_lock, flags);
+        usb_hcd_giveback_urb(q->vhci->hcd, urb, urb->status);
+        spin_lock_irqsave(&q->urb_lock, flags);
+    }
+    spin_unlock_irqrestore(&q->urb_lock, flags);
+}
+
+static void bce_vhci_transfer_queue_init_pending_urbs(struct bce_vhci_transfer_queue *q);
+
+static void bce_vhci_transfer_queue_deliver_pending(struct bce_vhci_transfer_queue *q)
+{
+    struct urb *urb;
+    struct bce_vhci_list_message *lm;
+
+    while (!list_empty(&q->endp->urb_list) && !list_empty(&q->evq)) {
+        urb = list_first_entry(&q->endp->urb_list, struct urb, urb_list);
+
+        lm = list_first_entry(&q->evq, struct bce_vhci_list_message, list);
+        if (bce_vhci_urb_update(urb->hcpriv, &lm->msg) == -EAGAIN)
+            break;
+        list_del(&lm->list);
+        kfree(lm);
+    }
+
+    /* some of the URBs could have been completed, so initialize more URBs if possible */
+    bce_vhci_transfer_queue_init_pending_urbs(q);
+}
+
+static void bce_vhci_transfer_queue_remove_pending(struct bce_vhci_transfer_queue *q)
+{
+    unsigned long flags;
+    struct bce_vhci_list_message *lm;
+    spin_lock_irqsave(&q->urb_lock, flags);
+    while (!list_empty(&q->evq)) {
+        lm = list_first_entry(&q->evq, struct bce_vhci_list_message, list);
+        list_del(&lm->list);
+        kfree(lm);
+    }
+    spin_unlock_irqrestore(&q->urb_lock, flags);
+}
+
+void bce_vhci_transfer_queue_event(struct bce_vhci_transfer_queue *q, struct bce_vhci_message *msg)
+{
+    unsigned long flags;
+    struct bce_vhci_urb *turb;
+    struct urb *urb;
+    spin_lock_irqsave(&q->urb_lock, flags);
+    bce_vhci_transfer_queue_deliver_pending(q);
+
+    if (msg->cmd == BCE_VHCI_CMD_TRANSFER_REQUEST &&
+        (!list_empty(&q->evq) || list_empty(&q->endp->urb_list))) {
+        bce_vhci_transfer_queue_defer_event(q, msg);
+        goto complete;
+    }
+    if (list_empty(&q->endp->urb_list)) {
+        pr_err("bce-vhci: [%02x] Unexpected transfer queue event\n", q->endp_addr);
+        goto complete;
+    }
+    urb = list_first_entry(&q->endp->urb_list, struct urb, urb_list);
+    turb = urb->hcpriv;
+    if (bce_vhci_urb_update(turb, msg) == -EAGAIN) {
+        bce_vhci_transfer_queue_defer_event(q, msg);
+    } else {
+        bce_vhci_transfer_queue_init_pending_urbs(q);
+    }
+
+complete:
+    spin_unlock_irqrestore(&q->urb_lock, flags);
+    bce_vhci_transfer_queue_giveback(q);
+}
+
+static void bce_vhci_transfer_queue_completion(struct bce_queue_sq *sq)
+{
+    unsigned long flags;
+    struct bce_sq_completion_data *c;
+    struct urb *urb;
+    struct bce_vhci_transfer_queue *q = sq->userdata;
+    spin_lock_irqsave(&q->urb_lock, flags);
+    while ((c = bce_next_completion(sq))) {
+        if (c->status == BCE_COMPLETION_ABORTED) { /* We flushed the queue */
+            pr_debug("bce-vhci: [%02x] Got an abort completion\n", q->endp_addr);
+            bce_notify_submission_complete(sq);
+            continue;
+        }
+        if (list_empty(&q->endp->urb_list)) {
+            pr_err("bce-vhci: [%02x] Got a completion while no requests are pending\n", q->endp_addr);
+            continue;
+        }
+        pr_debug("bce-vhci: [%02x] Got a transfer queue completion\n", q->endp_addr);
+        urb = list_first_entry(&q->endp->urb_list, struct urb, urb_list);
+        bce_vhci_urb_transfer_completion(urb->hcpriv, c);
+        bce_notify_submission_complete(sq);
+    }
+    bce_vhci_transfer_queue_deliver_pending(q);
+    spin_unlock_irqrestore(&q->urb_lock, flags);
+    bce_vhci_transfer_queue_giveback(q);
+}
+
+int bce_vhci_transfer_queue_do_pause(struct bce_vhci_transfer_queue *q)
+{
+    unsigned long flags;
+    int status;
+    u8 endp_addr = (u8) (q->endp->desc.bEndpointAddress & 0x8F);
+    spin_lock_irqsave(&q->urb_lock, flags);
+    q->active = false;
+    spin_unlock_irqrestore(&q->urb_lock, flags);
+    if (q->sq_out) {
+        pr_err("bce-vhci: Not implemented: wait for pending output requests\n");
+    }
+    bce_vhci_transfer_queue_remove_pending(q);
+    if ((status = bce_vhci_cmd_endpoint_set_state(
+            &q->vhci->cq, q->dev_addr, endp_addr, BCE_VHCI_ENDPOINT_PAUSED, &q->state)))
+        return status;
+    if (q->state != BCE_VHCI_ENDPOINT_PAUSED)
+        return -EINVAL;
+    if (q->sq_in)
+        bce_cmd_flush_memory_queue(q->vhci->dev->cmd_cmdq, (u16) q->sq_in->qid);
+    if (q->sq_out)
+        bce_cmd_flush_memory_queue(q->vhci->dev->cmd_cmdq, (u16) q->sq_out->qid);
+    return 0;
+}
+
+static void bce_vhci_urb_resume(struct bce_vhci_urb *urb);
+
+int bce_vhci_transfer_queue_do_resume(struct bce_vhci_transfer_queue *q)
+{
+    unsigned long flags;
+    int status;
+    struct urb *urb, *urbt;
+    struct bce_vhci_urb *vurb;
+    u8 endp_addr = (u8) (q->endp->desc.bEndpointAddress & 0x8F);
+    if ((status = bce_vhci_cmd_endpoint_set_state(
+            &q->vhci->cq, q->dev_addr, endp_addr, BCE_VHCI_ENDPOINT_ACTIVE, &q->state)))
+        return status;
+    if (q->state != BCE_VHCI_ENDPOINT_ACTIVE)
+        return -EINVAL;
+    spin_lock_irqsave(&q->urb_lock, flags);
+    q->active = true;
+    list_for_each_entry_safe(urb, urbt, &q->endp->urb_list, urb_list) {
+        vurb = urb->hcpriv;
+        if (vurb->state == BCE_VHCI_URB_INIT_PENDING) {
+            if (!bce_vhci_transfer_queue_can_init_urb(q))
+                break;
+            bce_vhci_urb_init(vurb);
+        } else {
+            bce_vhci_urb_resume(vurb);
+        }
+    }
+    bce_vhci_transfer_queue_deliver_pending(q);
+    spin_unlock_irqrestore(&q->urb_lock, flags);
+    return 0;
+}
+
+int bce_vhci_transfer_queue_pause(struct bce_vhci_transfer_queue *q, enum bce_vhci_pause_source src)
+{
+    int ret = 0;
+    mutex_lock(&q->pause_lock);
+    if ((q->paused_by & src) != src) {
+        if (!q->paused_by)
+            ret = bce_vhci_transfer_queue_do_pause(q);
+        if (!ret)
+            q->paused_by |= src;
+    }
+    mutex_unlock(&q->pause_lock);
+    return ret;
+}
+
+int bce_vhci_transfer_queue_resume(struct bce_vhci_transfer_queue *q, enum bce_vhci_pause_source src)
+{
+    int ret = 0;
+    mutex_lock(&q->pause_lock);
+    if (q->paused_by & src) {
+        if (!(q->paused_by & ~src))
+            ret = bce_vhci_transfer_queue_do_resume(q);
+        if (!ret)
+            q->paused_by &= ~src;
+    }
+    mutex_unlock(&q->pause_lock);
+    return ret;
+}
+
+static void bce_vhci_transfer_queue_reset_w(struct work_struct *work)
+{
+    unsigned long flags;
+    struct bce_vhci_transfer_queue *q = container_of(work, struct bce_vhci_transfer_queue, w_reset);
+
+    mutex_lock(&q->pause_lock);
+    spin_lock_irqsave(&q->urb_lock, flags);
+    if (!q->stalled) {
+        spin_unlock_irqrestore(&q->urb_lock, flags);
+        mutex_unlock(&q->pause_lock);
+        return;
+    }
+    q->active = false;
+    spin_unlock_irqrestore(&q->urb_lock, flags);
+    q->paused_by |= BCE_VHCI_PAUSE_INTERNAL_WQ;
+    bce_vhci_transfer_queue_remove_pending(q);
+    if (q->sq_in)
+        bce_cmd_flush_memory_queue(q->vhci->dev->cmd_cmdq, (u16) q->sq_in->qid);
+    if (q->sq_out)
+        bce_cmd_flush_memory_queue(q->vhci->dev->cmd_cmdq, (u16) q->sq_out->qid);
+    bce_vhci_cmd_endpoint_reset(&q->vhci->cq, q->dev_addr, (u8) (q->endp->desc.bEndpointAddress & 0x8F));
+    spin_lock_irqsave(&q->urb_lock, flags);
+    q->stalled = false;
+    spin_unlock_irqrestore(&q->urb_lock, flags);
+    mutex_unlock(&q->pause_lock);
+    bce_vhci_transfer_queue_resume(q, BCE_VHCI_PAUSE_INTERNAL_WQ);
+}
+
+void bce_vhci_transfer_queue_request_reset(struct bce_vhci_transfer_queue *q)
+{
+    queue_work(q->vhci->tq_state_wq, &q->w_reset);
+}
+
+static void bce_vhci_transfer_queue_init_pending_urbs(struct bce_vhci_transfer_queue *q)
+{
+    struct urb *urb, *urbt;
+    struct bce_vhci_urb *vurb;
+    list_for_each_entry_safe(urb, urbt, &q->endp->urb_list, urb_list) {
+        vurb = urb->hcpriv;
+        if (!bce_vhci_transfer_queue_can_init_urb(q))
+            break;
+        if (vurb->state == BCE_VHCI_URB_INIT_PENDING)
+            bce_vhci_urb_init(vurb);
+    }
+}
+
+
+
+static int bce_vhci_urb_data_start(struct bce_vhci_urb *urb, unsigned long *timeout);
+
+int bce_vhci_urb_create(struct bce_vhci_transfer_queue *q, struct urb *urb)
+{
+    unsigned long flags;
+    int status = 0;
+    struct bce_vhci_urb *vurb;
+    vurb = kzalloc(sizeof(struct bce_vhci_urb), GFP_KERNEL);
+    urb->hcpriv = vurb;
+
+    vurb->q = q;
+    vurb->urb = urb;
+    vurb->dir = usb_urb_dir_in(urb) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+    vurb->is_control = (usb_endpoint_num(&urb->ep->desc) == 0);
+
+    spin_lock_irqsave(&q->urb_lock, flags);
+    status = usb_hcd_link_urb_to_ep(q->vhci->hcd, urb);
+    if (status) {
+        spin_unlock_irqrestore(&q->urb_lock, flags);
+        urb->hcpriv = NULL;
+        kfree(vurb);
+        return status;
+    }
+
+    if (q->active) {
+        if (bce_vhci_transfer_queue_can_init_urb(vurb->q))
+            status = bce_vhci_urb_init(vurb);
+        else
+            vurb->state = BCE_VHCI_URB_INIT_PENDING;
+    } else {
+        if (q->stalled)
+            bce_vhci_transfer_queue_request_reset(q);
+        vurb->state = BCE_VHCI_URB_INIT_PENDING;
+    }
+    if (status) {
+        usb_hcd_unlink_urb_from_ep(q->vhci->hcd, urb);
+        urb->hcpriv = NULL;
+        kfree(vurb);
+    } else {
+        bce_vhci_transfer_queue_deliver_pending(q);
+    }
+    spin_unlock_irqrestore(&q->urb_lock, flags);
+    pr_debug("bce-vhci: [%02x] URB enqueued (dir = %s, size = %i)\n", q->endp_addr,
+            usb_urb_dir_in(urb) ? "IN" : "OUT", urb->transfer_buffer_length);
+    return status;
+}
+
+static int bce_vhci_urb_init(struct bce_vhci_urb *vurb)
+{
+    int status = 0;
+
+    if (vurb->q->remaining_active_requests == 0) {
+        pr_err("bce-vhci: cannot init request (remaining_active_requests = 0)\n");
+        return -EINVAL;
+    }
+
+    if (vurb->is_control) {
+        vurb->state = BCE_VHCI_URB_CONTROL_WAITING_FOR_SETUP_REQUEST;
+    } else {
+        status = bce_vhci_urb_data_start(vurb, NULL);
+    }
+
+    if (!status) {
+        --vurb->q->remaining_active_requests;
+    }
+    return status;
+}
+
+static void bce_vhci_urb_complete(struct bce_vhci_urb *urb, int status)
+{
+    struct bce_vhci_transfer_queue *q = urb->q;
+    struct bce_vhci *vhci = q->vhci;
+    struct urb *real_urb = urb->urb;
+    pr_debug("bce-vhci: [%02x] URB complete %i\n", q->endp_addr, status);
+    usb_hcd_unlink_urb_from_ep(vhci->hcd, real_urb);
+    real_urb->hcpriv = NULL;
+    real_urb->status = status;
+    if (urb->state != BCE_VHCI_URB_INIT_PENDING)
+        ++urb->q->remaining_active_requests;
+    kfree(urb);
+    list_add_tail(&real_urb->urb_list, &q->giveback_urb_list);
+}
+
+int bce_vhci_urb_request_cancel(struct bce_vhci_transfer_queue *q, struct urb *urb, int status)
+{
+    struct bce_vhci_urb *vurb;
+    unsigned long flags;
+    int ret;
+
+    spin_lock_irqsave(&q->urb_lock, flags);
+    if ((ret = usb_hcd_check_unlink_urb(q->vhci->hcd, urb, status))) {
+        spin_unlock_irqrestore(&q->urb_lock, flags);
+        return ret;
+    }
+
+    vurb = urb->hcpriv;
+    /* If the URB wasn't posted to the device yet, we can still remove it on the host without pausing the queue. */
+    if (vurb->state != BCE_VHCI_URB_INIT_PENDING) {
+        pr_debug("bce-vhci: [%02x] Cancelling URB\n", q->endp_addr);
+
+        spin_unlock_irqrestore(&q->urb_lock, flags);
+        bce_vhci_transfer_queue_pause(q, BCE_VHCI_PAUSE_INTERNAL_WQ);
+        spin_lock_irqsave(&q->urb_lock, flags);
+
+        ++q->remaining_active_requests;
+    }
+
+    usb_hcd_unlink_urb_from_ep(q->vhci->hcd, urb);
+
+    spin_unlock_irqrestore(&q->urb_lock, flags);
+
+    usb_hcd_giveback_urb(q->vhci->hcd, urb, status);
+
+    if (vurb->state != BCE_VHCI_URB_INIT_PENDING)
+        bce_vhci_transfer_queue_resume(q, BCE_VHCI_PAUSE_INTERNAL_WQ);
+
+    kfree(vurb);
+
+    return 0;
+}
+
+static int bce_vhci_urb_data_transfer_in(struct bce_vhci_urb *urb, unsigned long *timeout)
+{
+    struct bce_vhci_message msg;
+    struct bce_qe_submission *s;
+    u32 tr_len;
+    int reservation1, reservation2 = -EFAULT;
+
+    pr_debug("bce-vhci: [%02x] DMA from device %llx %x\n", urb->q->endp_addr,
+             (u64) urb->urb->transfer_dma, urb->urb->transfer_buffer_length);
+
+    /* Reserve both a message and a submission, so we don't run into issues later. */
+    reservation1 = bce_reserve_submission(urb->q->vhci->msg_asynchronous.sq, timeout);
+    if (!reservation1)
+        reservation2 = bce_reserve_submission(urb->q->sq_in, timeout);
+    if (reservation1 || reservation2) {
+        pr_err("bce-vhci: Failed to reserve a submission for URB data transfer\n");
+        if (!reservation1)
+            bce_cancel_submission_reservation(urb->q->vhci->msg_asynchronous.sq);
+        return -ENOMEM;
+    }
+
+    urb->send_offset = urb->receive_offset;
+
+    tr_len = urb->urb->transfer_buffer_length - urb->send_offset;
+
+    spin_lock(&urb->q->vhci->msg_asynchronous_lock);
+    msg.cmd = BCE_VHCI_CMD_TRANSFER_REQUEST;
+    msg.status = 0;
+    msg.param1 = ((urb->urb->ep->desc.bEndpointAddress & 0x8Fu) << 8) | urb->q->dev_addr;
+    msg.param2 = tr_len;
+    bce_vhci_message_queue_write(&urb->q->vhci->msg_asynchronous, &msg);
+    spin_unlock(&urb->q->vhci->msg_asynchronous_lock);
+
+    s = bce_next_submission(urb->q->sq_in);
+    bce_set_submission_single(s, urb->urb->transfer_dma + urb->send_offset, tr_len);
+    bce_submit_to_device(urb->q->sq_in);
+
+    urb->state = BCE_VHCI_URB_WAITING_FOR_COMPLETION;
+    return 0;
+}
+
+static int bce_vhci_urb_data_start(struct bce_vhci_urb *urb, unsigned long *timeout)
+{
+    if (urb->dir == DMA_TO_DEVICE) {
+        if (urb->urb->transfer_buffer_length > 0)
+            urb->state = BCE_VHCI_URB_WAITING_FOR_TRANSFER_REQUEST;
+        else
+            urb->state = BCE_VHCI_URB_DATA_TRANSFER_COMPLETE;
+        return 0;
+    } else {
+        return bce_vhci_urb_data_transfer_in(urb, timeout);
+    }
+}
+
+static int bce_vhci_urb_send_out_data(struct bce_vhci_urb *urb, dma_addr_t addr, size_t size)
+{
+    struct bce_qe_submission *s;
+    unsigned long timeout = 0;
+    if (bce_reserve_submission(urb->q->sq_out, &timeout)) {
+        pr_err("bce-vhci: Failed to reserve a submission for URB data transfer\n");
+        return -EPIPE;
+    }
+
+    pr_debug("bce-vhci: [%02x] DMA to device %llx %lx\n", urb->q->endp_addr, (u64) addr, size);
+
+    s = bce_next_submission(urb->q->sq_out);
+    bce_set_submission_single(s, addr, size);
+    bce_submit_to_device(urb->q->sq_out);
+    return 0;
+}
+
+static int bce_vhci_urb_data_update(struct bce_vhci_urb *urb, struct bce_vhci_message *msg)
+{
+    u32 tr_len;
+    int status;
+    if (urb->state == BCE_VHCI_URB_WAITING_FOR_TRANSFER_REQUEST) {
+        if (msg->cmd == BCE_VHCI_CMD_TRANSFER_REQUEST) {
+            tr_len = min(urb->urb->transfer_buffer_length - urb->send_offset, (u32) msg->param2);
+            if ((status = bce_vhci_urb_send_out_data(urb, urb->urb->transfer_dma + urb->send_offset, tr_len)))
+                return status;
+            urb->send_offset += tr_len;
+            urb->state = BCE_VHCI_URB_WAITING_FOR_COMPLETION;
+            return 0;
+        }
+    }
+
+    /* 0x1000 in out queues aren't really unexpected */
+    if (msg->cmd == BCE_VHCI_CMD_TRANSFER_REQUEST && urb->q->sq_out != NULL)
+        return -EAGAIN;
+    pr_err("bce-vhci: [%02x] %s URB unexpected message (state = %x, msg: %x %x %x %llx)\n",
+            urb->q->endp_addr, (urb->is_control ? "Control (data update)" : "Data"), urb->state,
+            msg->cmd, msg->status, msg->param1, msg->param2);
+    return -EAGAIN;
+}
+
+static int bce_vhci_urb_data_transfer_completion(struct bce_vhci_urb *urb, struct bce_sq_completion_data *c)
+{
+    if (urb->state == BCE_VHCI_URB_WAITING_FOR_COMPLETION) {
+        urb->receive_offset += c->data_size;
+        if (urb->dir == DMA_FROM_DEVICE || urb->receive_offset >= urb->urb->transfer_buffer_length) {
+            urb->urb->actual_length = (u32) urb->receive_offset;
+            urb->state = BCE_VHCI_URB_DATA_TRANSFER_COMPLETE;
+            if (!urb->is_control) {
+                bce_vhci_urb_complete(urb, 0);
+                return -ENOENT;
+            }
+        }
+    } else {
+        pr_err("bce-vhci: [%02x] Data URB unexpected completion\n", urb->q->endp_addr);
+    }
+    return 0;
+}
+
+
+static int bce_vhci_urb_control_check_status(struct bce_vhci_urb *urb)
+{
+    struct bce_vhci_transfer_queue *q = urb->q;
+    if (urb->received_status == 0)
+        return 0;
+    if (urb->state == BCE_VHCI_URB_DATA_TRANSFER_COMPLETE ||
+        (urb->received_status != BCE_VHCI_SUCCESS && urb->state != BCE_VHCI_URB_CONTROL_WAITING_FOR_SETUP_REQUEST &&
+        urb->state != BCE_VHCI_URB_CONTROL_WAITING_FOR_SETUP_COMPLETION)) {
+        urb->state = BCE_VHCI_URB_CONTROL_COMPLETE;
+        if (urb->received_status != BCE_VHCI_SUCCESS) {
+            pr_err("bce-vhci: [%02x] URB failed: %x\n", urb->q->endp_addr, urb->received_status);
+            urb->q->active = false;
+            urb->q->stalled = true;
+            bce_vhci_urb_complete(urb, -EPIPE);
+            if (!list_empty(&q->endp->urb_list))
+                bce_vhci_transfer_queue_request_reset(q);
+            return -ENOENT;
+        }
+        bce_vhci_urb_complete(urb, 0);
+        return -ENOENT;
+    }
+    return 0;
+}
+
+static int bce_vhci_urb_control_update(struct bce_vhci_urb *urb, struct bce_vhci_message *msg)
+{
+    int status;
+    if (msg->cmd == BCE_VHCI_CMD_CONTROL_TRANSFER_STATUS) {
+        urb->received_status = msg->status;
+        return bce_vhci_urb_control_check_status(urb);
+    }
+
+    if (urb->state == BCE_VHCI_URB_CONTROL_WAITING_FOR_SETUP_REQUEST) {
+        if (msg->cmd == BCE_VHCI_CMD_TRANSFER_REQUEST) {
+            if (bce_vhci_urb_send_out_data(urb, urb->urb->setup_dma, sizeof(struct usb_ctrlrequest))) {
+                pr_err("bce-vhci: [%02x] Failed to start URB setup transfer\n", urb->q->endp_addr);
+                return 0; /* TODO: fail the URB? */
+            }
+            urb->state = BCE_VHCI_URB_CONTROL_WAITING_FOR_SETUP_COMPLETION;
+            pr_debug("bce-vhci: [%02x] Sent setup %llx\n", urb->q->endp_addr, urb->urb->setup_dma);
+            return 0;
+        }
+    } else if (urb->state == BCE_VHCI_URB_WAITING_FOR_TRANSFER_REQUEST ||
+               urb->state == BCE_VHCI_URB_WAITING_FOR_COMPLETION) {
+        if ((status = bce_vhci_urb_data_update(urb, msg)))
+            return status;
+        return bce_vhci_urb_control_check_status(urb);
+    }
+
+    /* 0x1000 in out queues aren't really unexpected */
+    if (msg->cmd == BCE_VHCI_CMD_TRANSFER_REQUEST && urb->q->sq_out != NULL)
+        return -EAGAIN;
+    pr_err("bce-vhci: [%02x] Control URB unexpected message (state = %x, msg: %x %x %x %llx)\n", urb->q->endp_addr,
+            urb->state, msg->cmd, msg->status, msg->param1, msg->param2);
+    return -EAGAIN;
+}
+
+static int bce_vhci_urb_control_transfer_completion(struct bce_vhci_urb *urb, struct bce_sq_completion_data *c)
+{
+    int status;
+    unsigned long timeout;
+
+    if (urb->state == BCE_VHCI_URB_CONTROL_WAITING_FOR_SETUP_COMPLETION) {
+        if (c->data_size != sizeof(struct usb_ctrlrequest))
+            pr_err("bce-vhci: [%02x] transfer complete data size mistmatch for usb_ctrlrequest (%llx instead of %lx)\n",
+                   urb->q->endp_addr, c->data_size, sizeof(struct usb_ctrlrequest));
+
+        timeout = 1000;
+        status = bce_vhci_urb_data_start(urb, &timeout);
+        if (status) {
+            bce_vhci_urb_complete(urb, status);
+            return -ENOENT;
+        }
+        return 0;
+    } else if (urb->state == BCE_VHCI_URB_WAITING_FOR_TRANSFER_REQUEST ||
+               urb->state == BCE_VHCI_URB_WAITING_FOR_COMPLETION) {
+        if ((status = bce_vhci_urb_data_transfer_completion(urb, c)))
+            return status;
+        return bce_vhci_urb_control_check_status(urb);
+    } else {
+        pr_err("bce-vhci: [%02x] Control URB unexpected completion (state = %x)\n", urb->q->endp_addr, urb->state);
+    }
+    return 0;
+}
+
+static int bce_vhci_urb_update(struct bce_vhci_urb *urb, struct bce_vhci_message *msg)
+{
+    if (urb->state == BCE_VHCI_URB_INIT_PENDING)
+        return -EAGAIN;
+    if (urb->is_control)
+        return bce_vhci_urb_control_update(urb, msg);
+    else
+        return bce_vhci_urb_data_update(urb, msg);
+}
+
+static int bce_vhci_urb_transfer_completion(struct bce_vhci_urb *urb, struct bce_sq_completion_data *c)
+{
+    if (urb->is_control)
+        return bce_vhci_urb_control_transfer_completion(urb, c);
+    else
+        return bce_vhci_urb_data_transfer_completion(urb, c);
+}
+
+static void bce_vhci_urb_resume(struct bce_vhci_urb *urb)
+{
+    int status = 0;
+    if (urb->state == BCE_VHCI_URB_WAITING_FOR_COMPLETION) {
+        status = bce_vhci_urb_data_transfer_in(urb, NULL);
+    }
+    if (status)
+        bce_vhci_urb_complete(urb, status);
+}
diff --git a/drivers/staging/apple-bce/vhci/transfer.h b/drivers/staging/apple-bce/vhci/transfer.h
new file mode 100644
index 000000000000..89ecad6bcf8f
--- /dev/null
+++ b/drivers/staging/apple-bce/vhci/transfer.h
@@ -0,0 +1,73 @@
+#ifndef BCEDRIVER_TRANSFER_H
+#define BCEDRIVER_TRANSFER_H
+
+#include <linux/usb.h>
+#include "queue.h"
+#include "command.h"
+#include "../queue.h"
+
+struct bce_vhci_list_message {
+    struct list_head list;
+    struct bce_vhci_message msg;
+};
+enum bce_vhci_pause_source {
+    BCE_VHCI_PAUSE_INTERNAL_WQ = 1,
+    BCE_VHCI_PAUSE_FIRMWARE = 2,
+    BCE_VHCI_PAUSE_SUSPEND = 4,
+    BCE_VHCI_PAUSE_SHUTDOWN = 8
+};
+struct bce_vhci_transfer_queue {
+    struct bce_vhci *vhci;
+    struct usb_host_endpoint *endp;
+    enum bce_vhci_endpoint_state state;
+    u32 max_active_requests, remaining_active_requests;
+    bool active, stalled;
+    u32 paused_by;
+    bce_vhci_device_t dev_addr;
+    u8 endp_addr;
+    struct bce_queue_cq *cq;
+    struct bce_queue_sq *sq_in;
+    struct bce_queue_sq *sq_out;
+    struct list_head evq;
+    struct spinlock urb_lock;
+    struct mutex pause_lock;
+    struct list_head giveback_urb_list;
+
+    struct work_struct w_reset;
+};
+enum bce_vhci_urb_state {
+    BCE_VHCI_URB_INIT_PENDING,
+
+    BCE_VHCI_URB_WAITING_FOR_TRANSFER_REQUEST,
+    BCE_VHCI_URB_WAITING_FOR_COMPLETION,
+    BCE_VHCI_URB_DATA_TRANSFER_COMPLETE,
+
+    BCE_VHCI_URB_CONTROL_WAITING_FOR_SETUP_REQUEST,
+    BCE_VHCI_URB_CONTROL_WAITING_FOR_SETUP_COMPLETION,
+    BCE_VHCI_URB_CONTROL_COMPLETE
+};
+struct bce_vhci_urb {
+    struct urb *urb;
+    struct bce_vhci_transfer_queue *q;
+    enum dma_data_direction dir;
+    bool is_control;
+    enum bce_vhci_urb_state state;
+    int received_status;
+    u32 send_offset;
+    u32 receive_offset;
+};
+
+void bce_vhci_create_transfer_queue(struct bce_vhci *vhci, struct bce_vhci_transfer_queue *q,
+        struct usb_host_endpoint *endp, bce_vhci_device_t dev_addr, enum dma_data_direction dir);
+void bce_vhci_destroy_transfer_queue(struct bce_vhci *vhci, struct bce_vhci_transfer_queue *q);
+void bce_vhci_transfer_queue_event(struct bce_vhci_transfer_queue *q, struct bce_vhci_message *msg);
+int bce_vhci_transfer_queue_do_pause(struct bce_vhci_transfer_queue *q);
+int bce_vhci_transfer_queue_do_resume(struct bce_vhci_transfer_queue *q);
+int bce_vhci_transfer_queue_pause(struct bce_vhci_transfer_queue *q, enum bce_vhci_pause_source src);
+int bce_vhci_transfer_queue_resume(struct bce_vhci_transfer_queue *q, enum bce_vhci_pause_source src);
+void bce_vhci_transfer_queue_request_reset(struct bce_vhci_transfer_queue *q);
+
+int bce_vhci_urb_create(struct bce_vhci_transfer_queue *q, struct urb *urb);
+int bce_vhci_urb_request_cancel(struct bce_vhci_transfer_queue *q, struct urb *urb, int status);
+
+#endif //BCEDRIVER_TRANSFER_H
diff --git a/drivers/staging/apple-bce/vhci/vhci.c b/drivers/staging/apple-bce/vhci/vhci.c
new file mode 100644
index 000000000000..eb26f55000d8
--- /dev/null
+++ b/drivers/staging/apple-bce/vhci/vhci.c
@@ -0,0 +1,759 @@
+#include "vhci.h"
+#include "../apple_bce.h"
+#include "command.h"
+#include <linux/usb.h>
+#include <linux/usb/hcd.h>
+#include <linux/module.h>
+#include <linux/version.h>
+
+static dev_t bce_vhci_chrdev;
+static struct class *bce_vhci_class;
+static const struct hc_driver bce_vhci_driver;
+static u16 bce_vhci_port_mask = U16_MAX;
+
+static int bce_vhci_create_event_queues(struct bce_vhci *vhci);
+static void bce_vhci_destroy_event_queues(struct bce_vhci *vhci);
+static int bce_vhci_create_message_queues(struct bce_vhci *vhci);
+static void bce_vhci_destroy_message_queues(struct bce_vhci *vhci);
+static void bce_vhci_handle_firmware_events_w(struct work_struct *ws);
+static void bce_vhci_firmware_event_completion(struct bce_queue_sq *sq);
+
+int bce_vhci_create(struct apple_bce_device *dev, struct bce_vhci *vhci)
+{
+    int status;
+
+    spin_lock_init(&vhci->hcd_spinlock);
+
+    vhci->dev = dev;
+
+    vhci->vdevt = bce_vhci_chrdev;
+    vhci->vdev = device_create(bce_vhci_class, dev->dev, vhci->vdevt, NULL, "bce-vhci");
+    if (IS_ERR_OR_NULL(vhci->vdev)) {
+        status = PTR_ERR(vhci->vdev);
+        goto fail_dev;
+    }
+
+    if ((status = bce_vhci_create_message_queues(vhci)))
+        goto fail_mq;
+    if ((status = bce_vhci_create_event_queues(vhci)))
+        goto fail_eq;
+
+    vhci->tq_state_wq = alloc_ordered_workqueue("bce-vhci-tq-state", 0);
+    INIT_WORK(&vhci->w_fw_events, bce_vhci_handle_firmware_events_w);
+
+    vhci->hcd = usb_create_hcd(&bce_vhci_driver, vhci->vdev, "bce-vhci");
+    if (!vhci->hcd) {
+        status = -ENOMEM;
+        goto fail_hcd;
+    }
+    vhci->hcd->self.sysdev = &dev->pci->dev;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5,4,0)
+    vhci->hcd->self.uses_dma = 1;
+#endif
+    *((struct bce_vhci **) vhci->hcd->hcd_priv) = vhci;
+    vhci->hcd->speed = HCD_USB2;
+
+    if ((status = usb_add_hcd(vhci->hcd, 0, 0)))
+        goto fail_hcd;
+
+    return 0;
+
+fail_hcd:
+    bce_vhci_destroy_event_queues(vhci);
+fail_eq:
+    bce_vhci_destroy_message_queues(vhci);
+fail_mq:
+    device_destroy(bce_vhci_class, vhci->vdevt);
+fail_dev:
+    if (!status)
+        status = -EINVAL;
+    return status;
+}
+
+void bce_vhci_destroy(struct bce_vhci *vhci)
+{
+    usb_remove_hcd(vhci->hcd);
+    bce_vhci_destroy_event_queues(vhci);
+    bce_vhci_destroy_message_queues(vhci);
+    device_destroy(bce_vhci_class, vhci->vdevt);
+}
+
+struct bce_vhci *bce_vhci_from_hcd(struct usb_hcd *hcd)
+{
+    return *((struct bce_vhci **) hcd->hcd_priv);
+}
+
+int bce_vhci_start(struct usb_hcd *hcd)
+{
+    struct bce_vhci *vhci = bce_vhci_from_hcd(hcd);
+    int status;
+    u16 port_mask = 0;
+    bce_vhci_port_t port_no = 0;
+    if ((status = bce_vhci_cmd_controller_enable(&vhci->cq, 1, &port_mask)))
+        return status;
+    vhci->port_mask = port_mask;
+    vhci->port_power_mask = 0;
+    if ((status = bce_vhci_cmd_controller_start(&vhci->cq)))
+        return status;
+    port_mask = vhci->port_mask;
+    while (port_mask) {
+        port_no += 1;
+        port_mask >>= 1;
+    }
+    vhci->port_count = port_no;
+    return 0;
+}
+
+void bce_vhci_stop(struct usb_hcd *hcd)
+{
+    struct bce_vhci *vhci = bce_vhci_from_hcd(hcd);
+    bce_vhci_cmd_controller_disable(&vhci->cq);
+}
+
+static int bce_vhci_hub_status_data(struct usb_hcd *hcd, char *buf)
+{
+    return 0;
+}
+
+static int bce_vhci_reset_device(struct bce_vhci *vhci, int index, u16 timeout);
+
+static int bce_vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength)
+{
+    struct bce_vhci *vhci = bce_vhci_from_hcd(hcd);
+    int status;
+    struct usb_hub_descriptor *hd;
+    struct usb_hub_status *hs;
+    struct usb_port_status *ps;
+    u32 port_status;
+    // pr_info("bce-vhci: bce_vhci_hub_control %x %i %i [bufl=%i]\n", typeReq, wValue, wIndex, wLength);
+    if (typeReq == GetHubDescriptor && wLength >= sizeof(struct usb_hub_descriptor)) {
+        hd = (struct usb_hub_descriptor *) buf;
+        memset(hd, 0, sizeof(*hd));
+        hd->bDescLength = sizeof(struct usb_hub_descriptor);
+        hd->bDescriptorType = USB_DT_HUB;
+        hd->bNbrPorts = (u8) vhci->port_count;
+        hd->wHubCharacteristics = HUB_CHAR_INDV_PORT_LPSM | HUB_CHAR_INDV_PORT_OCPM;
+        hd->bPwrOn2PwrGood = 0;
+        hd->bHubContrCurrent = 0;
+        return 0;
+    } else if (typeReq == GetHubStatus && wLength >= sizeof(struct usb_hub_status)) {
+        hs = (struct usb_hub_status *) buf;
+        memset(hs, 0, sizeof(*hs));
+        hs->wHubStatus = 0;
+        hs->wHubChange = 0;
+        return 0;
+    } else if (typeReq == GetPortStatus && wLength >= 4 /* usb 2.0 */) {
+        ps = (struct usb_port_status *) buf;
+        ps->wPortStatus = 0;
+        ps->wPortChange = 0;
+
+        if (vhci->port_power_mask & BIT(wIndex))
+            ps->wPortStatus |= USB_PORT_STAT_POWER;
+
+        if (!(bce_vhci_port_mask & BIT(wIndex)))
+            return 0;
+
+        if ((status = bce_vhci_cmd_port_status(&vhci->cq, (u8) wIndex, 0, &port_status)))
+            return status;
+
+        if (port_status & 16)
+            ps->wPortStatus |= USB_PORT_STAT_ENABLE | USB_PORT_STAT_HIGH_SPEED;
+        if (port_status & 4)
+            ps->wPortStatus |= USB_PORT_STAT_CONNECTION;
+        if (port_status & 2)
+            ps->wPortStatus |= USB_PORT_STAT_OVERCURRENT;
+        if (port_status & 8)
+            ps->wPortStatus |= USB_PORT_STAT_RESET;
+        if (port_status & 0x60)
+            ps->wPortStatus |= USB_PORT_STAT_SUSPEND;
+
+        if (port_status & 0x40000)
+            ps->wPortChange |= USB_PORT_STAT_C_CONNECTION;
+
+        pr_debug("bce-vhci: Translated status %x to %x:%x\n", port_status, ps->wPortStatus, ps->wPortChange);
+        return 0;
+    } else if (typeReq == SetPortFeature) {
+        if (wValue == USB_PORT_FEAT_POWER) {
+            status = bce_vhci_cmd_port_power_on(&vhci->cq, (u8) wIndex);
+            /* As far as I am aware, power status is not part of the port status so store it separately */
+            if (!status)
+                vhci->port_power_mask |= BIT(wIndex);
+            return status;
+        }
+        if (wValue == USB_PORT_FEAT_RESET) {
+            return bce_vhci_reset_device(vhci, wIndex, wValue);
+        }
+        if (wValue == USB_PORT_FEAT_SUSPEND) {
+            /* TODO: Am I supposed to also suspend the endpoints? */
+            pr_debug("bce-vhci: Suspending port %i\n", wIndex);
+            return bce_vhci_cmd_port_suspend(&vhci->cq, (u8) wIndex);
+        }
+    } else if (typeReq == ClearPortFeature) {
+        if (wValue == USB_PORT_FEAT_ENABLE)
+            return bce_vhci_cmd_port_disable(&vhci->cq, (u8) wIndex);
+        if (wValue == USB_PORT_FEAT_POWER) {
+            status = bce_vhci_cmd_port_power_off(&vhci->cq, (u8) wIndex);
+            if (!status)
+                vhci->port_power_mask &= ~BIT(wIndex);
+            return status;
+        }
+        if (wValue == USB_PORT_FEAT_C_CONNECTION)
+            return bce_vhci_cmd_port_status(&vhci->cq, (u8) wIndex, 0x40000, &port_status);
+        if (wValue == USB_PORT_FEAT_C_RESET) { /* I don't think I can transfer it in any way */
+            return 0;
+        }
+        if (wValue == USB_PORT_FEAT_SUSPEND) {
+            pr_debug("bce-vhci: Resuming port %i\n", wIndex);
+            return bce_vhci_cmd_port_resume(&vhci->cq, (u8) wIndex);
+        }
+    }
+    pr_err("bce-vhci: bce_vhci_hub_control unhandled request: %x %i %i [bufl=%i]\n", typeReq, wValue, wIndex, wLength);
+    dump_stack();
+    return -EIO;
+}
+
+static int bce_vhci_enable_device(struct usb_hcd *hcd, struct usb_device *udev)
+{
+    struct bce_vhci *vhci = bce_vhci_from_hcd(hcd);
+    struct bce_vhci_device *vdev;
+    bce_vhci_device_t devid;
+    pr_info("bce_vhci_enable_device\n");
+
+    if (vhci->port_to_device[udev->portnum])
+        return 0;
+
+    /* We need to early address the device */
+    if (bce_vhci_cmd_device_create(&vhci->cq, udev->portnum, &devid))
+        return -EIO;
+
+    pr_info("bce_vhci_cmd_device_create %i -> %i\n", udev->portnum, devid);
+
+    vdev = kzalloc(sizeof(struct bce_vhci_device), GFP_KERNEL);
+    vhci->port_to_device[udev->portnum] = devid;
+    vhci->devices[devid] = vdev;
+
+    bce_vhci_create_transfer_queue(vhci, &vdev->tq[0], &udev->ep0, devid, DMA_BIDIRECTIONAL);
+    udev->ep0.hcpriv = &vdev->tq[0];
+    vdev->tq_mask |= BIT(0);
+
+    bce_vhci_cmd_endpoint_create(&vhci->cq, devid, &udev->ep0.desc);
+    return 0;
+}
+
+static int bce_vhci_address_device(struct usb_hcd *hcd, struct usb_device *udev, unsigned int timeout_ms) //TODO: follow timeout
+{
+    /* This is the same as enable_device, but instead in the old scheme */
+    return bce_vhci_enable_device(hcd, udev);
+}
+
+static void bce_vhci_free_device(struct usb_hcd *hcd, struct usb_device *udev)
+{
+    struct bce_vhci *vhci = bce_vhci_from_hcd(hcd);
+    int i;
+    bce_vhci_device_t devid;
+    struct bce_vhci_device *dev;
+    pr_info("bce_vhci_free_device %i\n", udev->portnum);
+    if (!vhci->port_to_device[udev->portnum])
+        return;
+    devid = vhci->port_to_device[udev->portnum];
+    dev = vhci->devices[devid];
+    for (i = 0; i < 32; i++) {
+        if (dev->tq_mask & BIT(i)) {
+            bce_vhci_transfer_queue_pause(&dev->tq[i], BCE_VHCI_PAUSE_SHUTDOWN);
+            bce_vhci_cmd_endpoint_destroy(&vhci->cq, devid, (u8) i);
+            bce_vhci_destroy_transfer_queue(vhci, &dev->tq[i]);
+        }
+    }
+    vhci->devices[devid] = NULL;
+    vhci->port_to_device[udev->portnum] = 0;
+    bce_vhci_cmd_device_destroy(&vhci->cq, devid);
+    kfree(dev);
+}
+
+static int bce_vhci_reset_device(struct bce_vhci *vhci, int index, u16 timeout)
+{
+    struct bce_vhci_device *dev = NULL;
+    bce_vhci_device_t devid;
+    int i;
+    int status;
+    enum dma_data_direction dir;
+    pr_info("bce_vhci_reset_device %i\n", index);
+
+    devid = vhci->port_to_device[index];
+    if (devid) {
+        dev = vhci->devices[devid];
+
+        for (i = 0; i < 32; i++) {
+            if (dev->tq_mask & BIT(i)) {
+                bce_vhci_transfer_queue_pause(&dev->tq[i], BCE_VHCI_PAUSE_SHUTDOWN);
+                bce_vhci_cmd_endpoint_destroy(&vhci->cq, devid, (u8) i);
+                bce_vhci_destroy_transfer_queue(vhci, &dev->tq[i]);
+            }
+        }
+        vhci->devices[devid] = NULL;
+        vhci->port_to_device[index] = 0;
+        bce_vhci_cmd_device_destroy(&vhci->cq, devid);
+    }
+    status = bce_vhci_cmd_port_reset(&vhci->cq, (u8) index, timeout);
+
+    if (dev) {
+        if ((status = bce_vhci_cmd_device_create(&vhci->cq, index, &devid)))
+            return status;
+        vhci->devices[devid] = dev;
+        vhci->port_to_device[index] = devid;
+
+        for (i = 0; i < 32; i++) {
+            if (dev->tq_mask & BIT(i)) {
+                dir = usb_endpoint_dir_in(&dev->tq[i].endp->desc) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+                if (i == 0)
+                    dir = DMA_BIDIRECTIONAL;
+                bce_vhci_create_transfer_queue(vhci, &dev->tq[i], dev->tq[i].endp, devid, dir);
+                bce_vhci_cmd_endpoint_create(&vhci->cq, devid, &dev->tq[i].endp->desc);
+            }
+        }
+    }
+
+    return status;
+}
+
+static int bce_vhci_check_bandwidth(struct usb_hcd *hcd, struct usb_device *udev)
+{
+    return 0;
+}
+
+static int bce_vhci_get_frame_number(struct usb_hcd *hcd)
+{
+    return 0;
+}
+
+static int bce_vhci_bus_suspend(struct usb_hcd *hcd)
+{
+    int i, j;
+    int status;
+    struct bce_vhci *vhci = bce_vhci_from_hcd(hcd);
+    pr_info("bce_vhci: suspend started\n");
+
+    pr_info("bce_vhci: suspend endpoints\n");
+    for (i = 0; i < 16; i++) {
+        if (!vhci->port_to_device[i])
+            continue;
+        for (j = 0; j < 32; j++) {
+            if (!(vhci->devices[vhci->port_to_device[i]]->tq_mask & BIT(j)))
+                continue;
+            bce_vhci_transfer_queue_pause(&vhci->devices[vhci->port_to_device[i]]->tq[j],
+                    BCE_VHCI_PAUSE_SUSPEND);
+        }
+    }
+
+    pr_info("bce_vhci: suspend ports\n");
+    for (i = 0; i < 16; i++) {
+        if (!vhci->port_to_device[i])
+            continue;
+        bce_vhci_cmd_port_suspend(&vhci->cq, i);
+    }
+    pr_info("bce_vhci: suspend controller\n");
+    if ((status = bce_vhci_cmd_controller_pause(&vhci->cq)))
+        return status;
+
+    bce_vhci_event_queue_pause(&vhci->ev_commands);
+    bce_vhci_event_queue_pause(&vhci->ev_system);
+    bce_vhci_event_queue_pause(&vhci->ev_isochronous);
+    bce_vhci_event_queue_pause(&vhci->ev_interrupt);
+    bce_vhci_event_queue_pause(&vhci->ev_asynchronous);
+    pr_info("bce_vhci: suspend done\n");
+    return 0;
+}
+
+static int bce_vhci_bus_resume(struct usb_hcd *hcd)
+{
+    int i, j;
+    int status;
+    struct bce_vhci *vhci = bce_vhci_from_hcd(hcd);
+    pr_info("bce_vhci: resume started\n");
+
+    bce_vhci_event_queue_resume(&vhci->ev_system);
+    bce_vhci_event_queue_resume(&vhci->ev_isochronous);
+    bce_vhci_event_queue_resume(&vhci->ev_interrupt);
+    bce_vhci_event_queue_resume(&vhci->ev_asynchronous);
+    bce_vhci_event_queue_resume(&vhci->ev_commands);
+
+    pr_info("bce_vhci: resume controller\n");
+    if ((status = bce_vhci_cmd_controller_start(&vhci->cq)))
+        return status;
+
+    pr_info("bce_vhci: resume ports\n");
+    for (i = 0; i < 16; i++) {
+        if (!vhci->port_to_device[i])
+            continue;
+        bce_vhci_cmd_port_resume(&vhci->cq, i);
+    }
+    pr_info("bce_vhci: resume endpoints\n");
+    for (i = 0; i < 16; i++) {
+        if (!vhci->port_to_device[i])
+            continue;
+        for (j = 0; j < 32; j++) {
+            if (!(vhci->devices[vhci->port_to_device[i]]->tq_mask & BIT(j)))
+                continue;
+            bce_vhci_transfer_queue_resume(&vhci->devices[vhci->port_to_device[i]]->tq[j],
+                    BCE_VHCI_PAUSE_SUSPEND);
+        }
+    }
+
+    pr_info("bce_vhci: resume done\n");
+    return 0;
+}
+
+static int bce_vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags)
+{
+    struct bce_vhci_transfer_queue *q = urb->ep->hcpriv;
+    pr_debug("bce_vhci_urb_enqueue %i:%x\n", q->dev_addr, urb->ep->desc.bEndpointAddress);
+    if (!q)
+        return -ENOENT;
+    return bce_vhci_urb_create(q, urb);
+}
+
+static int bce_vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
+{
+    struct bce_vhci_transfer_queue *q = urb->ep->hcpriv;
+    pr_debug("bce_vhci_urb_dequeue %x\n", urb->ep->desc.bEndpointAddress);
+    return bce_vhci_urb_request_cancel(q, urb, status);
+}
+
+static void bce_vhci_endpoint_reset(struct usb_hcd *hcd, struct usb_host_endpoint *ep)
+{
+    struct bce_vhci_transfer_queue *q = ep->hcpriv;
+    pr_debug("bce_vhci_endpoint_reset\n");
+    if (q)
+        bce_vhci_transfer_queue_request_reset(q);
+}
+
+static u8 bce_vhci_endpoint_index(u8 addr)
+{
+    if (addr & 0x80)
+        return (u8) (0x10 + (addr & 0xf));
+    return (u8) (addr & 0xf);
+}
+
+static int bce_vhci_add_endpoint(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint *endp)
+{
+    u8 endp_index = bce_vhci_endpoint_index(endp->desc.bEndpointAddress);
+    struct bce_vhci *vhci = bce_vhci_from_hcd(hcd);
+    bce_vhci_device_t devid = vhci->port_to_device[udev->portnum];
+    struct bce_vhci_device *vdev = vhci->devices[devid];
+    pr_debug("bce_vhci_add_endpoint %x/%x:%x\n", udev->portnum, devid, endp_index);
+
+    if (udev->bus->root_hub == udev) /* The USB hub */
+        return 0;
+    if (vdev == NULL)
+        return -ENODEV;
+    if (vdev->tq_mask & BIT(endp_index)) {
+        endp->hcpriv = &vdev->tq[endp_index];
+        return 0;
+    }
+
+    bce_vhci_create_transfer_queue(vhci, &vdev->tq[endp_index], endp, devid,
+            usb_endpoint_dir_in(&endp->desc) ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+    endp->hcpriv = &vdev->tq[endp_index];
+    vdev->tq_mask |= BIT(endp_index);
+
+    bce_vhci_cmd_endpoint_create(&vhci->cq, devid, &endp->desc);
+    return 0;
+}
+
+static int bce_vhci_drop_endpoint(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint *endp)
+{
+    u8 endp_index = bce_vhci_endpoint_index(endp->desc.bEndpointAddress);
+    struct bce_vhci *vhci = bce_vhci_from_hcd(hcd);
+    bce_vhci_device_t devid = vhci->port_to_device[udev->portnum];
+    struct bce_vhci_transfer_queue *q = endp->hcpriv;
+    struct bce_vhci_device *vdev = vhci->devices[devid];
+    pr_info("bce_vhci_drop_endpoint %x:%x\n", udev->portnum, endp_index);
+    if (!q) {
+        if (vdev && vdev->tq_mask & BIT(endp_index)) {
+            pr_err("something deleted the hcpriv?\n");
+            q = &vdev->tq[endp_index];
+        } else {
+            return 0;
+        }
+    }
+
+    bce_vhci_cmd_endpoint_destroy(&vhci->cq, devid, (u8) (endp->desc.bEndpointAddress & 0x8Fu));
+    vhci->devices[devid]->tq_mask &= ~BIT(endp_index);
+    bce_vhci_destroy_transfer_queue(vhci, q);
+    return 0;
+}
+
+static int bce_vhci_create_message_queues(struct bce_vhci *vhci)
+{
+    if (bce_vhci_message_queue_create(vhci, &vhci->msg_commands, "VHC1HostCommands") ||
+        bce_vhci_message_queue_create(vhci, &vhci->msg_system, "VHC1HostSystemEvents") ||
+        bce_vhci_message_queue_create(vhci, &vhci->msg_isochronous, "VHC1HostIsochronousEvents") ||
+        bce_vhci_message_queue_create(vhci, &vhci->msg_interrupt, "VHC1HostInterruptEvents") ||
+        bce_vhci_message_queue_create(vhci, &vhci->msg_asynchronous, "VHC1HostAsynchronousEvents")) {
+        bce_vhci_destroy_message_queues(vhci);
+        return -EINVAL;
+    }
+    spin_lock_init(&vhci->msg_asynchronous_lock);
+    bce_vhci_command_queue_create(&vhci->cq, &vhci->msg_commands);
+    return 0;
+}
+
+static void bce_vhci_destroy_message_queues(struct bce_vhci *vhci)
+{
+    bce_vhci_command_queue_destroy(&vhci->cq);
+    bce_vhci_message_queue_destroy(vhci, &vhci->msg_commands);
+    bce_vhci_message_queue_destroy(vhci, &vhci->msg_system);
+    bce_vhci_message_queue_destroy(vhci, &vhci->msg_isochronous);
+    bce_vhci_message_queue_destroy(vhci, &vhci->msg_interrupt);
+    bce_vhci_message_queue_destroy(vhci, &vhci->msg_asynchronous);
+}
+
+static void bce_vhci_handle_system_event(struct bce_vhci_event_queue *q, struct bce_vhci_message *msg);
+static void bce_vhci_handle_usb_event(struct bce_vhci_event_queue *q, struct bce_vhci_message *msg);
+
+static int bce_vhci_create_event_queues(struct bce_vhci *vhci)
+{
+    vhci->ev_cq = bce_create_cq(vhci->dev, 0x100);
+    if (!vhci->ev_cq)
+        return -EINVAL;
+#define CREATE_EVENT_QUEUE(field, name, cb) bce_vhci_event_queue_create(vhci, &vhci->field, name, cb)
+    if (__bce_vhci_event_queue_create(vhci, &vhci->ev_commands, "VHC1FirmwareCommands",
+            bce_vhci_firmware_event_completion) ||
+        CREATE_EVENT_QUEUE(ev_system,       "VHC1FirmwareSystemEvents",       bce_vhci_handle_system_event) ||
+        CREATE_EVENT_QUEUE(ev_isochronous,  "VHC1FirmwareIsochronousEvents",  bce_vhci_handle_usb_event) ||
+        CREATE_EVENT_QUEUE(ev_interrupt,    "VHC1FirmwareInterruptEvents",    bce_vhci_handle_usb_event) ||
+        CREATE_EVENT_QUEUE(ev_asynchronous, "VHC1FirmwareAsynchronousEvents", bce_vhci_handle_usb_event)) {
+        bce_vhci_destroy_event_queues(vhci);
+        return -EINVAL;
+    }
+#undef CREATE_EVENT_QUEUE
+    return 0;
+}
+
+static void bce_vhci_destroy_event_queues(struct bce_vhci *vhci)
+{
+    bce_vhci_event_queue_destroy(vhci, &vhci->ev_commands);
+    bce_vhci_event_queue_destroy(vhci, &vhci->ev_system);
+    bce_vhci_event_queue_destroy(vhci, &vhci->ev_isochronous);
+    bce_vhci_event_queue_destroy(vhci, &vhci->ev_interrupt);
+    bce_vhci_event_queue_destroy(vhci, &vhci->ev_asynchronous);
+    if (vhci->ev_cq)
+        bce_destroy_cq(vhci->dev, vhci->ev_cq);
+}
+
+static void bce_vhci_send_fw_event_response(struct bce_vhci *vhci, struct bce_vhci_message *req, u16 status)
+{
+    unsigned long timeout = 1000;
+    struct bce_vhci_message r = *req;
+    r.cmd = (u16) (req->cmd | 0x8000u);
+    r.status = status;
+    r.param1 = req->param1;
+    r.param2 = 0;
+
+    if (bce_reserve_submission(vhci->msg_system.sq, &timeout)) {
+        pr_err("bce-vhci: Cannot reserve submision for FW event reply\n");
+        return;
+    }
+    bce_vhci_message_queue_write(&vhci->msg_system, &r);
+}
+
+static int bce_vhci_handle_firmware_event(struct bce_vhci *vhci, struct bce_vhci_message *msg)
+{
+    unsigned long flags;
+    bce_vhci_device_t devid;
+    u8 endp;
+    struct bce_vhci_device *dev;
+    struct bce_vhci_transfer_queue *tq;
+    if (msg->cmd == BCE_VHCI_CMD_ENDPOINT_REQUEST_STATE || msg->cmd == BCE_VHCI_CMD_ENDPOINT_SET_STATE) {
+        devid = (bce_vhci_device_t) (msg->param1 & 0xff);
+        endp = bce_vhci_endpoint_index((u8) ((msg->param1 >> 8) & 0xff));
+        dev = vhci->devices[devid];
+        if (!dev || !(dev->tq_mask & BIT(endp)))
+            return BCE_VHCI_BAD_ARGUMENT;
+        tq = &dev->tq[endp];
+    }
+
+    if (msg->cmd == BCE_VHCI_CMD_ENDPOINT_REQUEST_STATE) {
+        if (msg->param2 == BCE_VHCI_ENDPOINT_ACTIVE) {
+            bce_vhci_transfer_queue_resume(tq, BCE_VHCI_PAUSE_FIRMWARE);
+            return BCE_VHCI_SUCCESS;
+        } else if (msg->param2 == BCE_VHCI_ENDPOINT_PAUSED) {
+            bce_vhci_transfer_queue_pause(tq, BCE_VHCI_PAUSE_FIRMWARE);
+            return BCE_VHCI_SUCCESS;
+        }
+        return BCE_VHCI_BAD_ARGUMENT;
+    } else if (msg->cmd == BCE_VHCI_CMD_ENDPOINT_SET_STATE) {
+        if (msg->param2 == BCE_VHCI_ENDPOINT_STALLED) {
+            tq->state = msg->param2;
+            spin_lock_irqsave(&tq->urb_lock, flags);
+            tq->stalled = true;
+            spin_unlock_irqrestore(&tq->urb_lock, flags);
+            return BCE_VHCI_SUCCESS;
+        }
+        return BCE_VHCI_BAD_ARGUMENT;
+    }
+    pr_warn("bce-vhci: Unhandled firmware event: %x s=%x p1=%x p2=%llx\n",
+            msg->cmd, msg->status, msg->param1, msg->param2);
+    return BCE_VHCI_BAD_ARGUMENT;
+}
+
+static void bce_vhci_handle_firmware_events_w(struct work_struct *ws)
+{
+    size_t cnt = 0;
+    int result;
+    struct bce_vhci *vhci = container_of(ws, struct bce_vhci, w_fw_events);
+    struct bce_queue_sq *sq = vhci->ev_commands.sq;
+    struct bce_sq_completion_data *cq;
+    struct bce_vhci_message *msg, *msg2 = NULL;
+
+    while (true) {
+        if (msg2) {
+            msg = msg2;
+            msg2 = NULL;
+        } else if ((cq = bce_next_completion(sq))) {
+            if (cq->status == BCE_COMPLETION_ABORTED) {
+                bce_notify_submission_complete(sq);
+                continue;
+            }
+            msg = &vhci->ev_commands.data[sq->head];
+        } else {
+            break;
+        }
+
+        pr_debug("bce-vhci: Got fw event: %x s=%x p1=%x p2=%llx\n", msg->cmd, msg->status, msg->param1, msg->param2);
+        if ((cq = bce_next_completion(sq))) {
+            msg2 = &vhci->ev_commands.data[(sq->head + 1) % sq->el_count];
+            pr_debug("bce-vhci: Got second fw event: %x s=%x p1=%x p2=%llx\n",
+                    msg->cmd, msg->status, msg->param1, msg->param2);
+            if (cq->status != BCE_COMPLETION_ABORTED &&
+                msg2->cmd == (msg->cmd | 0x4000) && msg2->param1 == msg->param1) {
+                /* Take two elements */
+                pr_debug("bce-vhci: Cancelled\n");
+                bce_vhci_send_fw_event_response(vhci, msg, BCE_VHCI_ABORT);
+
+                bce_notify_submission_complete(sq);
+                bce_notify_submission_complete(sq);
+                msg2 = NULL;
+                cnt += 2;
+                continue;
+            }
+
+            pr_warn("bce-vhci: Handle fw event - unexpected cancellation\n");
+        }
+
+        result = bce_vhci_handle_firmware_event(vhci, msg);
+        bce_vhci_send_fw_event_response(vhci, msg, (u16) result);
+
+
+        bce_notify_submission_complete(sq);
+        ++cnt;
+    }
+    bce_vhci_event_queue_submit_pending(&vhci->ev_commands, cnt);
+    if (atomic_read(&sq->available_commands) == sq->el_count - 1) {
+        pr_debug("bce-vhci: complete\n");
+        complete(&vhci->ev_commands.queue_empty_completion);
+    }
+}
+
+static void bce_vhci_firmware_event_completion(struct bce_queue_sq *sq)
+{
+    struct bce_vhci_event_queue *q = sq->userdata;
+    queue_work(q->vhci->tq_state_wq, &q->vhci->w_fw_events);
+}
+
+static void bce_vhci_handle_system_event(struct bce_vhci_event_queue *q, struct bce_vhci_message *msg)
+{
+    if (msg->cmd & 0x8000) {
+        bce_vhci_command_queue_deliver_completion(&q->vhci->cq, msg);
+    } else {
+        pr_warn("bce-vhci: Unhandled system event: %x s=%x p1=%x p2=%llx\n",
+                msg->cmd, msg->status, msg->param1, msg->param2);
+    }
+}
+
+static void bce_vhci_handle_usb_event(struct bce_vhci_event_queue *q, struct bce_vhci_message *msg)
+{
+    bce_vhci_device_t devid;
+    u8 endp;
+    struct bce_vhci_device *dev;
+    if (msg->cmd & 0x8000) {
+        bce_vhci_command_queue_deliver_completion(&q->vhci->cq, msg);
+    } else if (msg->cmd == BCE_VHCI_CMD_TRANSFER_REQUEST || msg->cmd == BCE_VHCI_CMD_CONTROL_TRANSFER_STATUS) {
+        devid = (bce_vhci_device_t) (msg->param1 & 0xff);
+        endp = bce_vhci_endpoint_index((u8) ((msg->param1 >> 8) & 0xff));
+        dev = q->vhci->devices[devid];
+        if (!dev || (dev->tq_mask & BIT(endp)) == 0) {
+            pr_err("bce-vhci: Didn't find destination for transfer queue event\n");
+            return;
+        }
+        bce_vhci_transfer_queue_event(&dev->tq[endp], msg);
+    } else {
+        pr_warn("bce-vhci: Unhandled USB event: %x s=%x p1=%x p2=%llx\n",
+                msg->cmd, msg->status, msg->param1, msg->param2);
+    }
+}
+
+
+
+static const struct hc_driver bce_vhci_driver = {
+        .description = "bce-vhci",
+        .product_desc = "BCE VHCI Host Controller",
+        .hcd_priv_size = sizeof(struct bce_vhci *),
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(5,4,0)
+        .flags = HCD_USB2,
+#else
+        .flags = HCD_USB2 | HCD_DMA,
+#endif
+
+        .start = bce_vhci_start,
+        .stop = bce_vhci_stop,
+        .hub_status_data = bce_vhci_hub_status_data,
+        .hub_control = bce_vhci_hub_control,
+        .urb_enqueue = bce_vhci_urb_enqueue,
+        .urb_dequeue = bce_vhci_urb_dequeue,
+        .enable_device = bce_vhci_enable_device,
+        .free_dev = bce_vhci_free_device,
+        .address_device = bce_vhci_address_device,
+        .add_endpoint = bce_vhci_add_endpoint,
+        .drop_endpoint = bce_vhci_drop_endpoint,
+        .endpoint_reset = bce_vhci_endpoint_reset,
+        .check_bandwidth = bce_vhci_check_bandwidth,
+        .get_frame_number = bce_vhci_get_frame_number,
+        .bus_suspend = bce_vhci_bus_suspend,
+        .bus_resume = bce_vhci_bus_resume
+};
+
+
+int __init bce_vhci_module_init(void)
+{
+    int result;
+    if ((result = alloc_chrdev_region(&bce_vhci_chrdev, 0, 1, "bce-vhci")))
+        goto fail_chrdev;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(6,4,0)
+    bce_vhci_class = class_create(THIS_MODULE, "bce-vhci");
+#else
+    bce_vhci_class = class_create("bce-vhci");
+#endif
+    if (IS_ERR(bce_vhci_class)) {
+        result = PTR_ERR(bce_vhci_class);
+        goto fail_class;
+    }
+    return 0;
+
+fail_class:
+    class_destroy(bce_vhci_class);
+fail_chrdev:
+    unregister_chrdev_region(bce_vhci_chrdev, 1);
+    if (!result)
+        result = -EINVAL;
+    return result;
+}
+void __exit bce_vhci_module_exit(void)
+{
+    class_destroy(bce_vhci_class);
+    unregister_chrdev_region(bce_vhci_chrdev, 1);
+}
+
+module_param_named(vhci_port_mask, bce_vhci_port_mask, ushort, 0444);
+MODULE_PARM_DESC(vhci_port_mask, "Specifies which VHCI ports are enabled");
diff --git a/drivers/staging/apple-bce/vhci/vhci.h b/drivers/staging/apple-bce/vhci/vhci.h
new file mode 100644
index 000000000000..6c2e22622f4c
--- /dev/null
+++ b/drivers/staging/apple-bce/vhci/vhci.h
@@ -0,0 +1,52 @@
+#ifndef BCE_VHCI_H
+#define BCE_VHCI_H
+
+#include "queue.h"
+#include "transfer.h"
+
+struct usb_hcd;
+struct bce_queue_cq;
+
+struct bce_vhci_device {
+    struct bce_vhci_transfer_queue tq[32];
+    u32 tq_mask;
+};
+struct bce_vhci {
+    struct apple_bce_device *dev;
+    dev_t vdevt;
+    struct device *vdev;
+    struct usb_hcd *hcd;
+    struct spinlock hcd_spinlock;
+    struct bce_vhci_message_queue msg_commands;
+    struct bce_vhci_message_queue msg_system;
+    struct bce_vhci_message_queue msg_isochronous;
+    struct bce_vhci_message_queue msg_interrupt;
+    struct bce_vhci_message_queue msg_asynchronous;
+    struct spinlock msg_asynchronous_lock;
+    struct bce_vhci_command_queue cq;
+    struct bce_queue_cq *ev_cq;
+    struct bce_vhci_event_queue ev_commands;
+    struct bce_vhci_event_queue ev_system;
+    struct bce_vhci_event_queue ev_isochronous;
+    struct bce_vhci_event_queue ev_interrupt;
+    struct bce_vhci_event_queue ev_asynchronous;
+    u16 port_mask;
+    u8 port_count;
+    u16 port_power_mask;
+    bce_vhci_device_t port_to_device[16];
+    struct bce_vhci_device *devices[16];
+    struct workqueue_struct *tq_state_wq;
+    struct work_struct w_fw_events;
+};
+
+int __init bce_vhci_module_init(void);
+void __exit bce_vhci_module_exit(void);
+
+int bce_vhci_create(struct apple_bce_device *dev, struct bce_vhci *vhci);
+void bce_vhci_destroy(struct bce_vhci *vhci);
+int bce_vhci_start(struct usb_hcd *hcd);
+void bce_vhci_stop(struct usb_hcd *hcd);
+
+struct bce_vhci *bce_vhci_from_hcd(struct usb_hcd *hcd);
+
+#endif //BCE_VHCI_H
diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index e02ba15f6e34..b35734d03109 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -517,6 +517,19 @@ static int usb_unbind_interface(struct device *dev)
 	return 0;
 }
 
+static void usb_shutdown_interface(struct device *dev)
+{
+	struct usb_interface *intf = to_usb_interface(dev);
+	struct usb_driver *driver;
+
+	if (!dev->driver)
+		return;
+
+	driver = to_usb_driver(dev->driver);
+	if (driver->shutdown)
+		driver->shutdown(intf);
+}
+
 /**
  * usb_driver_claim_interface - bind a driver to an interface
  * @driver: the driver to be bound
@@ -1059,6 +1072,7 @@ int usb_register_driver(struct usb_driver *new_driver, struct module *owner,
 	new_driver->driver.bus = &usb_bus_type;
 	new_driver->driver.probe = usb_probe_interface;
 	new_driver->driver.remove = usb_unbind_interface;
+	new_driver->driver.shutdown = usb_shutdown_interface;
 	new_driver->driver.owner = owner;
 	new_driver->driver.mod_name = mod_name;
 	new_driver->driver.dev_groups = new_driver->dev_groups;
diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c
index a04b4cb1382d..e89dacd4657d 100644
--- a/drivers/usb/storage/uas.c
+++ b/drivers/usb/storage/uas.c
@@ -1233,9 +1233,8 @@ static void uas_disconnect(struct usb_interface *intf)
  * hang on reboot when the device is still in uas mode. Note the reset is
  * necessary as some devices won't revert to usb-storage mode without it.
  */
-static void uas_shutdown(struct device *dev)
+static void uas_shutdown(struct usb_interface *intf)
 {
-	struct usb_interface *intf = to_usb_interface(dev);
 	struct usb_device *udev = interface_to_usbdev(intf);
 	struct Scsi_Host *shost = usb_get_intfdata(intf);
 	struct uas_dev_info *devinfo = (struct uas_dev_info *)shost->hostdata;
@@ -1258,7 +1257,7 @@ static struct usb_driver uas_driver = {
 	.suspend = uas_suspend,
 	.resume = uas_resume,
 	.reset_resume = uas_reset_resume,
-	.driver.shutdown = uas_shutdown,
+	.shutdown = uas_shutdown,
 	.id_table = uas_usb_ids,
 };
 
diff --git a/include/drm/drm_format_helper.h b/include/drm/drm_format_helper.h
index 428d81afe215..aa1604d92c1a 100644
--- a/include/drm/drm_format_helper.h
+++ b/include/drm/drm_format_helper.h
@@ -96,6 +96,9 @@ void drm_fb_xrgb8888_to_rgba5551(struct iosys_map *dst, const unsigned int *dst_
 void drm_fb_xrgb8888_to_rgb888(struct iosys_map *dst, const unsigned int *dst_pitch,
 			       const struct iosys_map *src, const struct drm_framebuffer *fb,
 			       const struct drm_rect *clip, struct drm_format_conv_state *state);
+void drm_fb_xrgb8888_to_bgr888(struct iosys_map *dst, const unsigned int *dst_pitch,
+			       const struct iosys_map *src, const struct drm_framebuffer *fb,
+			       const struct drm_rect *clip, struct drm_format_conv_state *state);
 void drm_fb_xrgb8888_to_argb8888(struct iosys_map *dst, const unsigned int *dst_pitch,
 				 const struct iosys_map *src, const struct drm_framebuffer *fb,
 				 const struct drm_rect *clip, struct drm_format_conv_state *state);
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 418e555459da..3a6c04a9f9aa 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -74,10 +74,10 @@ typedef void *efi_handle_t;
  */
 typedef guid_t efi_guid_t __aligned(__alignof__(u32));
 
-#define EFI_GUID(a, b, c, d...) (efi_guid_t){ {					\
+#define EFI_GUID(a, b, c, d...) ((efi_guid_t){ {				\
 	(a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff,	\
 	(b) & 0xff, ((b) >> 8) & 0xff,						\
-	(c) & 0xff, ((c) >> 8) & 0xff, d } }
+	(c) & 0xff, ((c) >> 8) & 0xff, d } })
 
 /*
  * Generic EFI table header
@@ -385,6 +385,7 @@ void efi_native_runtime_setup(void);
 #define EFI_MEMORY_ATTRIBUTES_TABLE_GUID	EFI_GUID(0xdcfa911d, 0x26eb, 0x469f,  0xa2, 0x20, 0x38, 0xb7, 0xdc, 0x46, 0x12, 0x20)
 #define EFI_CONSOLE_OUT_DEVICE_GUID		EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4,  0x9a, 0x46, 0x00, 0x90, 0x27, 0x3f, 0xc1, 0x4d)
 #define APPLE_PROPERTIES_PROTOCOL_GUID		EFI_GUID(0x91bd12fe, 0xf6c3, 0x44fb,  0xa5, 0xb7, 0x51, 0x22, 0xab, 0x30, 0x3a, 0xe0)
+#define APPLE_SET_OS_PROTOCOL_GUID		EFI_GUID(0xc5c5da95, 0x7d5c, 0x45e6,  0xb2, 0xf1, 0x3f, 0xd5, 0x2b, 0xb1, 0x00, 0x77)
 #define EFI_TCG2_PROTOCOL_GUID			EFI_GUID(0x607f766c, 0x7455, 0x42be,  0x93, 0x0b, 0xe4, 0xd7, 0x6d, 0xb2, 0x72, 0x0f)
 #define EFI_TCG2_FINAL_EVENTS_TABLE_GUID	EFI_GUID(0x1e2ed096, 0x30e2, 0x4254,  0xbd, 0x89, 0x86, 0x3b, 0xbe, 0xf8, 0x23, 0x25)
 #define EFI_LOAD_FILE_PROTOCOL_GUID		EFI_GUID(0x56ec3091, 0x954c, 0x11d2,  0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b)
diff --git a/include/linux/hid.h b/include/linux/hid.h
index 8e06d89698e6..6cdb5a451453 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -940,6 +940,8 @@ extern void hidinput_report_event(struct hid_device *hid, struct hid_report *rep
 extern int hidinput_connect(struct hid_device *hid, unsigned int force);
 extern void hidinput_disconnect(struct hid_device *);
 
+struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_type,
+				 unsigned int application, unsigned int usage);
 int hid_set_field(struct hid_field *, unsigned, __s32);
 int hid_input_report(struct hid_device *hid, enum hid_report_type type, u8 *data, u32 size,
 		     int interrupt);
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 1913a13833f2..832997a9da0a 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1171,6 +1171,7 @@ extern ssize_t usb_show_dynids(struct usb_dynids *dynids, char *buf);
  *	post_reset method is called.
  * @post_reset: Called by usb_reset_device() after the device
  *	has been reset
+ * @shutdown: Called at shut-down time to quiesce the device.
  * @id_table: USB drivers use ID table to support hotplugging.
  *	Export this with MODULE_DEVICE_TABLE(usb,...).  This must be set
  *	or your driver's probe function will never get called.
@@ -1222,6 +1223,8 @@ struct usb_driver {
 	int (*pre_reset)(struct usb_interface *intf);
 	int (*post_reset)(struct usb_interface *intf);
 
+	void (*shutdown)(struct usb_interface *intf);
+
 	const struct usb_device_id *id_table;
 	const struct attribute_group **dev_groups;
 
diff --git a/lib/test_printf.c b/lib/test_printf.c
index 69b6a5e177f2..a318bb72a165 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -745,18 +745,26 @@ static void __init fwnode_pointer(void)
 static void __init fourcc_pointer(void)
 {
 	struct {
+		char type;
 		u32 code;
 		char *str;
 	} const try[] = {
-		{ 0x3231564e, "NV12 little-endian (0x3231564e)", },
-		{ 0xb231564e, "NV12 big-endian (0xb231564e)", },
-		{ 0x10111213, ".... little-endian (0x10111213)", },
-		{ 0x20303159, "Y10  little-endian (0x20303159)", },
+		{ 'c', 0x3231564e, "NV12 little-endian (0x3231564e)", },
+		{ 'c', 0xb231564e, "NV12 big-endian (0xb231564e)", },
+		{ 'c', 0x10111213, ".... little-endian (0x10111213)", },
+		{ 'c', 0x20303159, "Y10  little-endian (0x20303159)", },
+		{ 'h', 0x67503030, "gP00 (0x67503030)", },
+		{ 'r', 0x30305067, "gP00 (0x67503030)", },
+		{ 'l', cpu_to_le32(0x67503030), "gP00 (0x67503030)", },
+		{ 'b', cpu_to_be32(0x67503030), "gP00 (0x67503030)", },
 	};
 	unsigned int i;
 
-	for (i = 0; i < ARRAY_SIZE(try); i++)
-		test(try[i].str, "%p4cc", &try[i].code);
+	for (i = 0; i < ARRAY_SIZE(try); i++) {
+		char fmt[] = { '%', 'p', '4', 'c', try[i].type, '\0' };
+
+		test(try[i].str, fmt, &try[i].code);
+	}
 }
 
 static void __init
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index cdd4e2314bfc..4feaea1815fa 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1760,27 +1760,50 @@ char *fourcc_string(char *buf, char *end, const u32 *fourcc,
 	char output[sizeof("0123 little-endian (0x01234567)")];
 	char *p = output;
 	unsigned int i;
+	bool pix_fmt = false;
 	u32 orig, val;
 
-	if (fmt[1] != 'c' || fmt[2] != 'c')
+	if (fmt[1] != 'c')
 		return error_string(buf, end, "(%p4?)", spec);
 
 	if (check_pointer(&buf, end, fourcc, spec))
 		return buf;
 
 	orig = get_unaligned(fourcc);
-	val = orig & ~BIT(31);
+	switch (fmt[2]) {
+	case 'h':
+		val = orig;
+		break;
+	case 'r':
+		val = orig = swab32(orig);
+		break;
+	case 'l':
+		val = orig = le32_to_cpu(orig);
+		break;
+	case 'b':
+		val = orig = be32_to_cpu(orig);
+		break;
+	case 'c':
+		/* Pixel formats are printed LSB-first */
+		val = swab32(orig & ~BIT(31));
+		pix_fmt = true;
+		break;
+	default:
+		return error_string(buf, end, "(%p4?)", spec);
+	}
 
 	for (i = 0; i < sizeof(u32); i++) {
-		unsigned char c = val >> (i * 8);
+		unsigned char c = val >> ((3 - i) * 8);
 
 		/* Print non-control ASCII characters as-is, dot otherwise */
 		*p++ = isascii(c) && isprint(c) ? c : '.';
 	}
 
-	*p++ = ' ';
-	strcpy(p, orig & BIT(31) ? "big-endian" : "little-endian");
-	p += strlen(p);
+	if (pix_fmt) {
+		*p++ = ' ';
+		strcpy(p, orig & BIT(31) ? "big-endian" : "little-endian");
+		p += strlen(p);
+	}
 
 	*p++ = ' ';
 	*p++ = '(';
@@ -2355,6 +2378,7 @@ char *rust_fmt_argument(char *buf, char *end, void *ptr);
  *       read the documentation (path below) first.
  * - 'NF' For a netdev_features_t
  * - '4cc' V4L2 or DRM FourCC code, with endianness and raw numerical value.
+ * - '4c[hlbr]' Generic FourCC code.
  * - 'h[CDN]' For a variable-length buffer, it prints it as a hex string with
  *            a certain separator (' ' by default):
  *              C colon
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 2b812210b412..4c3a8cc6ef15 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -6909,7 +6909,7 @@ sub process {
 					    ($extension eq "f" &&
 					     defined $qualifier && $qualifier !~ /^w/) ||
 					    ($extension eq "4" &&
-					     defined $qualifier && $qualifier !~ /^cc/)) {
+					     defined $qualifier && $qualifier !~ /^c[chlbr]/)) {
 						$bad_specifier = $specifier;
 						last;
 					}
-- 
2.46.0

From 859a4a5da2b5d6dc06d3ebba0360380a479c7ed5 Mon Sep 17 00:00:00 2001
From: Peter Jung <admin@ptr1337.dev>
Date: Thu, 12 Sep 2024 11:47:49 +0200
Subject: [PATCH 12/12] zstd

Signed-off-by: Peter Jung <admin@ptr1337.dev>
---
 include/linux/zstd.h                          |    2 +-
 include/linux/zstd_errors.h                   |   23 +-
 include/linux/zstd_lib.h                      |  850 +++++--
 lib/zstd/Makefile                             |    2 +-
 lib/zstd/common/allocations.h                 |   56 +
 lib/zstd/common/bits.h                        |  149 ++
 lib/zstd/common/bitstream.h                   |  127 +-
 lib/zstd/common/compiler.h                    |  134 +-
 lib/zstd/common/cpu.h                         |    3 +-
 lib/zstd/common/debug.c                       |    9 +-
 lib/zstd/common/debug.h                       |   34 +-
 lib/zstd/common/entropy_common.c              |   42 +-
 lib/zstd/common/error_private.c               |   12 +-
 lib/zstd/common/error_private.h               |   84 +-
 lib/zstd/common/fse.h                         |   94 +-
 lib/zstd/common/fse_decompress.c              |  130 +-
 lib/zstd/common/huf.h                         |  237 +-
 lib/zstd/common/mem.h                         |    3 +-
 lib/zstd/common/portability_macros.h          |   28 +-
 lib/zstd/common/zstd_common.c                 |   38 +-
 lib/zstd/common/zstd_deps.h                   |   16 +-
 lib/zstd/common/zstd_internal.h               |  109 +-
 lib/zstd/compress/clevels.h                   |    3 +-
 lib/zstd/compress/fse_compress.c              |   74 +-
 lib/zstd/compress/hist.c                      |    3 +-
 lib/zstd/compress/hist.h                      |    3 +-
 lib/zstd/compress/huf_compress.c              |  441 ++--
 lib/zstd/compress/zstd_compress.c             | 2111 ++++++++++++-----
 lib/zstd/compress/zstd_compress_internal.h    |  359 ++-
 lib/zstd/compress/zstd_compress_literals.c    |  155 +-
 lib/zstd/compress/zstd_compress_literals.h    |   25 +-
 lib/zstd/compress/zstd_compress_sequences.c   |    7 +-
 lib/zstd/compress/zstd_compress_sequences.h   |    3 +-
 lib/zstd/compress/zstd_compress_superblock.c  |  376 ++-
 lib/zstd/compress/zstd_compress_superblock.h  |    3 +-
 lib/zstd/compress/zstd_cwksp.h                |  169 +-
 lib/zstd/compress/zstd_double_fast.c          |  143 +-
 lib/zstd/compress/zstd_double_fast.h          |   17 +-
 lib/zstd/compress/zstd_fast.c                 |  596 +++--
 lib/zstd/compress/zstd_fast.h                 |    6 +-
 lib/zstd/compress/zstd_lazy.c                 |  732 +++---
 lib/zstd/compress/zstd_lazy.h                 |  138 +-
 lib/zstd/compress/zstd_ldm.c                  |   21 +-
 lib/zstd/compress/zstd_ldm.h                  |    3 +-
 lib/zstd/compress/zstd_ldm_geartab.h          |    3 +-
 lib/zstd/compress/zstd_opt.c                  |  497 ++--
 lib/zstd/compress/zstd_opt.h                  |   41 +-
 lib/zstd/decompress/huf_decompress.c          |  887 ++++---
 lib/zstd/decompress/zstd_ddict.c              |    9 +-
 lib/zstd/decompress/zstd_ddict.h              |    3 +-
 lib/zstd/decompress/zstd_decompress.c         |  358 ++-
 lib/zstd/decompress/zstd_decompress_block.c   |  708 +++---
 lib/zstd/decompress/zstd_decompress_block.h   |   10 +-
 .../decompress/zstd_decompress_internal.h     |    9 +-
 lib/zstd/decompress_sources.h                 |    2 +-
 lib/zstd/zstd_common_module.c                 |    5 +-
 lib/zstd/zstd_compress_module.c               |    2 +-
 lib/zstd/zstd_decompress_module.c             |    4 +-
 58 files changed, 6577 insertions(+), 3531 deletions(-)
 create mode 100644 lib/zstd/common/allocations.h
 create mode 100644 lib/zstd/common/bits.h

diff --git a/include/linux/zstd.h b/include/linux/zstd.h
index 113408eef6ec..f109d49f43f8 100644
--- a/include/linux/zstd.h
+++ b/include/linux/zstd.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h
index 58b6dd45a969..6d5cf55f0bf3 100644
--- a/include/linux/zstd_errors.h
+++ b/include/linux/zstd_errors.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -17,8 +18,17 @@
 
 
 /* =====   ZSTDERRORLIB_API : control library symbols visibility   ===== */
-#define ZSTDERRORLIB_VISIBILITY 
-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+#define ZSTDERRORLIB_VISIBLE 
+
+#ifndef ZSTDERRORLIB_HIDDEN
+#  if (__GNUC__ >= 4) && !defined(__MINGW32__)
+#    define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden")))
+#  else
+#    define ZSTDERRORLIB_HIDDEN
+#  endif
+#endif
+
+#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE
 
 /*-*********************************************
  *  Error codes list
@@ -43,14 +53,17 @@ typedef enum {
   ZSTD_error_frameParameter_windowTooLarge = 16,
   ZSTD_error_corruption_detected = 20,
   ZSTD_error_checksum_wrong      = 22,
+  ZSTD_error_literals_headerWrong = 24,
   ZSTD_error_dictionary_corrupted      = 30,
   ZSTD_error_dictionary_wrong          = 32,
   ZSTD_error_dictionaryCreation_failed = 34,
   ZSTD_error_parameter_unsupported   = 40,
+  ZSTD_error_parameter_combination_unsupported = 41,
   ZSTD_error_parameter_outOfBound    = 42,
   ZSTD_error_tableLog_tooLarge       = 44,
   ZSTD_error_maxSymbolValue_tooLarge = 46,
   ZSTD_error_maxSymbolValue_tooSmall = 48,
+  ZSTD_error_stabilityCondition_notRespected = 50,
   ZSTD_error_stage_wrong       = 60,
   ZSTD_error_init_missing      = 62,
   ZSTD_error_memory_allocation = 64,
@@ -58,11 +71,15 @@ typedef enum {
   ZSTD_error_dstSize_tooSmall = 70,
   ZSTD_error_srcSize_wrong    = 72,
   ZSTD_error_dstBuffer_null   = 74,
+  ZSTD_error_noForwardProgress_destFull = 80,
+  ZSTD_error_noForwardProgress_inputEmpty = 82,
   /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
   ZSTD_error_frameIndex_tooLarge = 100,
   ZSTD_error_seekableIO          = 102,
   ZSTD_error_dstBuffer_wrong     = 104,
   ZSTD_error_srcBuffer_wrong     = 105,
+  ZSTD_error_sequenceProducer_failed = 106,
+  ZSTD_error_externalSequences_invalid = 107,
   ZSTD_error_maxCode = 120  /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
 } ZSTD_ErrorCode;
 
diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h
index 79d55465d5c1..6320fedcf8a4 100644
--- a/include/linux/zstd_lib.h
+++ b/include/linux/zstd_lib.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,23 +12,42 @@
 #ifndef ZSTD_H_235446
 #define ZSTD_H_235446
 
-/* ======   Dependency   ======*/
+/* ======   Dependencies   ======*/
 #include <linux/limits.h>   /* INT_MAX */
 #include <linux/types.h>   /* size_t */
 
 
 /* =====   ZSTDLIB_API : control library symbols visibility   ===== */
-#ifndef ZSTDLIB_VISIBLE
+#define ZSTDLIB_VISIBLE 
+
+#ifndef ZSTDLIB_HIDDEN
 #  if (__GNUC__ >= 4) && !defined(__MINGW32__)
-#    define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default")))
 #    define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden")))
 #  else
-#    define ZSTDLIB_VISIBLE
 #    define ZSTDLIB_HIDDEN
 #  endif
 #endif
+
 #define ZSTDLIB_API ZSTDLIB_VISIBLE
 
+/* Deprecation warnings :
+ * Should these warnings be a problem, it is generally possible to disable them,
+ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual.
+ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS.
+ */
+#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS
+#  define ZSTD_DEPRECATED(message) /* disable deprecation warnings */
+#else
+#  if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__)
+#    define ZSTD_DEPRECATED(message) __attribute__((deprecated(message)))
+#  elif (__GNUC__ >= 3)
+#    define ZSTD_DEPRECATED(message) __attribute__((deprecated))
+#  else
+#    pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
+#    define ZSTD_DEPRECATED(message)
+#  endif
+#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */
+
 
 /* *****************************************************************************
   Introduction
@@ -65,7 +85,7 @@
 /*------   Version   ------*/
 #define ZSTD_VERSION_MAJOR    1
 #define ZSTD_VERSION_MINOR    5
-#define ZSTD_VERSION_RELEASE  2
+#define ZSTD_VERSION_RELEASE  6
 #define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
 
 /*! ZSTD_versionNumber() :
@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void);
 ***************************************/
 /*! ZSTD_compress() :
  *  Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
- *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ *  NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have
+ *        enough space to successfully compress the data.
  *  @return : compressed size written into `dst` (<= `dstCapacity),
  *            or an error code if it fails (which can be tested using ZSTD_isError()). */
 ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t
  *  "empty", "unknown" and "error" results to the same return value (0),
  *  while ZSTD_getFrameContentSize() gives them separate return values.
  * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */
-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize")
+ZSTDLIB_API
+unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
 
 /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+
  * `src` should point to the start of a ZSTD frame or skippable frame.
@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize)
 
 
 /*======  Helper functions  ======*/
-#define ZSTD_COMPRESSBOUND(srcSize)   ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
-ZSTDLIB_API size_t      ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+/* ZSTD_compressBound() :
+ * maximum compressed size in worst case single-pass scenario.
+ * When invoking `ZSTD_compress()` or any other one-pass compression function,
+ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize)
+ * as it eliminates one potential failure scenario,
+ * aka not enough room in dst buffer to write the compressed frame.
+ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE .
+ *        In which case, ZSTD_compressBound() will return an error code
+ *        which can be tested using ZSTD_isError().
+ *
+ * ZSTD_COMPRESSBOUND() :
+ * same as ZSTD_compressBound(), but as a macro.
+ * It can be used to produce constants, which can be useful for static allocation,
+ * for example to size a static array on stack.
+ * Will produce constant value 0 if srcSize too large.
+ */
+#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U)
+#define ZSTD_COMPRESSBOUND(srcSize)   (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+/* ZSTD_isError() :
+ * Most ZSTD_* functions returning a size_t value can be tested for error,
+ * using ZSTD_isError().
+ * @return 1 if error, 0 otherwise
+ */
 ZSTDLIB_API unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
 ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
 ZSTDLIB_API int         ZSTD_minCLevel(void);               /*!< minimum negative compression level allowed, requires v1.4.0+ */
@@ -183,7 +228,7 @@ ZSTDLIB_API int         ZSTD_defaultCLevel(void);           /*!< default compres
 /*= Compression context
  *  When compressing many times,
  *  it is recommended to allocate a context just once,
- *  and re-use it for each successive compression operation.
+ *  and reuse it for each successive compression operation.
  *  This will make workload friendlier for system's memory.
  *  Note : re-using context is just a speed / resource optimization.
  *         It doesn't change the compression ratio, which remains identical.
@@ -196,9 +241,9 @@ ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);  /* accept NULL pointer *
 
 /*! ZSTD_compressCCtx() :
  *  Same as ZSTD_compress(), using an explicit ZSTD_CCtx.
- *  Important : in order to behave similarly to `ZSTD_compress()`,
- *  this function compresses at requested compression level,
- *  __ignoring any other parameter__ .
+ *  Important : in order to mirror `ZSTD_compress()` behavior,
+ *  this function compresses at the requested compression level,
+ *  __ignoring any other advanced parameter__ .
  *  If any advanced parameter was set using the advanced API,
  *  they will all be reset. Only `compressionLevel` remains.
  */
@@ -210,7 +255,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
 /*= Decompression context
  *  When decompressing many times,
  *  it is recommended to allocate a context only once,
- *  and re-use it for each successive compression operation.
+ *  and reuse it for each successive compression operation.
  *  This will make workload friendlier for system's memory.
  *  Use one context per thread for parallel execution. */
 typedef struct ZSTD_DCtx_s ZSTD_DCtx;
@@ -220,7 +265,7 @@ ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);  /* accept NULL pointer *
 /*! ZSTD_decompressDCtx() :
  *  Same as ZSTD_decompress(),
  *  requires an allocated ZSTD_DCtx.
- *  Compatible with sticky parameters.
+ *  Compatible with sticky parameters (see below).
  */
 ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
                                        void* dst, size_t dstCapacity,
@@ -236,12 +281,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
  *   using ZSTD_CCtx_set*() functions.
  *   Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame.
  *   "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` !
- *   __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ .
+ *   __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ .
  *
  *   It's possible to reset all parameters to "default" using ZSTD_CCtx_reset().
  *
  *   This API supersedes all other "advanced" API entry points in the experimental section.
- *   In the future, we expect to remove from experimental API entry points which are redundant with this API.
+ *   In the future, we expect to remove API entry points from experimental which are redundant with this API.
  */
 
 
@@ -324,6 +369,19 @@ typedef enum {
                               * The higher the value of selected strategy, the more complex it is,
                               * resulting in stronger and slower compression.
                               * Special: value 0 means "use default strategy". */
+
+    ZSTD_c_targetCBlockSize=130, /* v1.5.6+
+                                  * Attempts to fit compressed block size into approximatively targetCBlockSize.
+                                  * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX.
+                                  * Note that it's not a guarantee, just a convergence target (default:0).
+                                  * No target when targetCBlockSize == 0.
+                                  * This is helpful in low bandwidth streaming environments to improve end-to-end latency,
+                                  * when a client can make use of partial documents (a prominent example being Chrome).
+                                  * Note: this parameter is stable since v1.5.6.
+                                  * It was present as an experimental parameter in earlier versions,
+                                  * but it's not recommended using it with earlier library versions
+                                  * due to massive performance regressions.
+                                  */
     /* LDM mode parameters */
     ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching.
                                      * This parameter is designed to improve compression ratio
@@ -403,7 +461,6 @@ typedef enum {
      * ZSTD_c_forceMaxWindow
      * ZSTD_c_forceAttachDict
      * ZSTD_c_literalCompressionMode
-     * ZSTD_c_targetCBlockSize
      * ZSTD_c_srcSizeHint
      * ZSTD_c_enableDedicatedDictSearch
      * ZSTD_c_stableInBuffer
@@ -412,6 +469,9 @@ typedef enum {
      * ZSTD_c_validateSequences
      * ZSTD_c_useBlockSplitter
      * ZSTD_c_useRowMatchFinder
+     * ZSTD_c_prefetchCDictTables
+     * ZSTD_c_enableSeqProducerFallback
+     * ZSTD_c_maxBlockSize
      * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
      * note : never ever use experimentalParam? names directly;
      *        also, the enums values themselves are unstable and can still change.
@@ -421,7 +481,7 @@ typedef enum {
      ZSTD_c_experimentalParam3=1000,
      ZSTD_c_experimentalParam4=1001,
      ZSTD_c_experimentalParam5=1002,
-     ZSTD_c_experimentalParam6=1003,
+     /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */
      ZSTD_c_experimentalParam7=1004,
      ZSTD_c_experimentalParam8=1005,
      ZSTD_c_experimentalParam9=1006,
@@ -430,7 +490,11 @@ typedef enum {
      ZSTD_c_experimentalParam12=1009,
      ZSTD_c_experimentalParam13=1010,
      ZSTD_c_experimentalParam14=1011,
-     ZSTD_c_experimentalParam15=1012
+     ZSTD_c_experimentalParam15=1012,
+     ZSTD_c_experimentalParam16=1013,
+     ZSTD_c_experimentalParam17=1014,
+     ZSTD_c_experimentalParam18=1015,
+     ZSTD_c_experimentalParam19=1016
 } ZSTD_cParameter;
 
 typedef struct {
@@ -493,7 +557,7 @@ typedef enum {
  *                  They will be used to compress next frame.
  *                  Resetting session never fails.
  *  - The parameters : changes all parameters back to "default".
- *                  This removes any reference to any dictionary too.
+ *                  This also removes any reference to any dictionary or external sequence producer.
  *                  Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing)
  *                  otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError())
  *  - Both : similar to resetting the session, followed by resetting parameters.
@@ -502,11 +566,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
 
 /*! ZSTD_compress2() :
  *  Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API.
+ *  (note that this entry point doesn't even expose a compression level parameter).
  *  ZSTD_compress2() always starts a new frame.
  *  Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
  *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
  *  - The function is always blocking, returns when compression is completed.
- *  Hint : compression runs faster if `dstCapacity` >=  `ZSTD_compressBound(srcSize)`.
+ *  NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have
+ *        enough space to successfully compress the data, though it is possible it fails for other reasons.
  * @return : compressed size written into `dst` (<= `dstCapacity),
  *           or an error code if it fails (which can be tested using ZSTD_isError()).
  */
@@ -543,13 +609,17 @@ typedef enum {
      * ZSTD_d_stableOutBuffer
      * ZSTD_d_forceIgnoreChecksum
      * ZSTD_d_refMultipleDDicts
+     * ZSTD_d_disableHuffmanAssembly
+     * ZSTD_d_maxBlockSize
      * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
      * note : never ever use experimentalParam? names directly
      */
      ZSTD_d_experimentalParam1=1000,
      ZSTD_d_experimentalParam2=1001,
      ZSTD_d_experimentalParam3=1002,
-     ZSTD_d_experimentalParam4=1003
+     ZSTD_d_experimentalParam4=1003,
+     ZSTD_d_experimentalParam5=1004,
+     ZSTD_d_experimentalParam6=1005
 
 } ZSTD_dParameter;
 
@@ -604,14 +674,14 @@ typedef struct ZSTD_outBuffer_s {
 *  A ZSTD_CStream object is required to track streaming operation.
 *  Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
 *  ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
-*  It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
+*  It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
 *
 *  For parallel execution, use one separate ZSTD_CStream per thread.
 *
 *  note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
 *
 *  Parameters are sticky : when starting a new compression on the same context,
-*  it will re-use the same sticky parameters as previous compression session.
+*  it will reuse the same sticky parameters as previous compression session.
 *  When in doubt, it's recommended to fully initialize the context before usage.
 *  Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
 *  ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
@@ -700,6 +770,11 @@ typedef enum {
  *            only ZSTD_e_end or ZSTD_e_flush operations are allowed.
  *            Before starting a new compression job, or changing compression parameters,
  *            it is required to fully flush internal buffers.
+ *  - note: if an operation ends with an error, it may leave @cctx in an undefined state.
+ *          Therefore, it's UB to invoke ZSTD_compressStream2() of ZSTD_compressStream() on such a state.
+ *          In order to be re-employed after an error, a state must be reset,
+ *          which can be done explicitly (ZSTD_CCtx_reset()),
+ *          or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx())
  */
 ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
                                          ZSTD_outBuffer* output,
@@ -728,8 +803,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /*< recommended size for output
  * This following is a legacy streaming API, available since v1.0+ .
  * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
  * It is redundant, but remains fully supported.
- * Streaming in combination with advanced parameters and dictionary compression
- * can only be used through the new API.
  ******************************************************************************/
 
 /*!
@@ -738,6 +811,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /*< recommended size for output
  *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
  *     ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
  *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *
+ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API
+ * to compress with a dictionary.
  */
 ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
 /*!
@@ -758,7 +834,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
 *
 *  A ZSTD_DStream object is required to track streaming operations.
 *  Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
-*  ZSTD_DStream objects can be re-used multiple times.
+*  ZSTD_DStream objects can be reused multiple times.
 *
 *  Use ZSTD_initDStream() to start a new decompression operation.
 * @return : recommended first input size
@@ -788,13 +864,37 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);  /* accept NULL pointer
 
 /*===== Streaming decompression functions =====*/
 
-/* This function is redundant with the advanced API and equivalent to:
+/*! ZSTD_initDStream() :
+ * Initialize/reset DStream state for new decompression operation.
+ * Call before new decompression operation using same DStream.
  *
+ * Note : This function is redundant with the advanced API and equivalent to:
  *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
  *     ZSTD_DCtx_refDDict(zds, NULL);
  */
 ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
 
+/*! ZSTD_decompressStream() :
+ * Streaming decompression function.
+ * Call repetitively to consume full input updating it as necessary.
+ * Function will update both input and output `pos` fields exposing current state via these fields:
+ * - `input.pos < input.size`, some input remaining and caller should provide remaining input
+ *   on the next call.
+ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers.
+ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers,
+ *   call ZSTD_decompressStream() again to flush remaining data to output.
+ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX.
+ *
+ * @return : 0 when a frame is completely decoded and fully flushed,
+ *           or an error code, which can be tested using ZSTD_isError(),
+ *           or any other value > 0, which means there is some decoding or flushing to do to complete current frame.
+ *
+ * Note: when an operation returns with an error code, the @zds state may be left in undefined state.
+ *       It's UB to invoke `ZSTD_decompressStream()` on such a state.
+ *       In order to re-use such a state, it must be first reset,
+ *       which can be done explicitly (`ZSTD_DCtx_reset()`),
+ *       or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`)
+ */
 ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
 
 ZSTDLIB_API size_t ZSTD_DStreamInSize(void);    /*!< recommended size for input buffer */
@@ -913,7 +1013,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
  *  If @return == 0, the dictID could not be decoded.
  *  This could for one of the following reasons :
  *  - The frame does not require a dictionary to be decoded (most common case).
- *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information.
+ *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information.
  *    Note : this use case also happens when using a non-conformant dictionary.
  *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
  *  - This is not a Zstandard frame.
@@ -925,9 +1025,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
  * Advanced dictionary and prefix API (Requires v1.4.0+)
  *
  * This API allows dictionaries to be used with ZSTD_compress2(),
- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and
- * only reset with the context is reset with ZSTD_reset_parameters or
- * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ * ZSTD_compressStream2(), and ZSTD_decompressDCtx().
+ * Dictionaries are sticky, they remain valid when same context is reused,
+ * they only reset when the context is reset
+ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters.
+ * In contrast, Prefixes are single-use.
  ******************************************************************************/
 
 
@@ -937,8 +1039,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
  *  Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
  *           meaning "return to no-dictionary mode".
- *  Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
- *           To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
+ *  Note 1 : Dictionary is sticky, it will be used for all future compressed frames,
+ *           until parameters are reset, a new dictionary is loaded, or the dictionary
+ *           is explicitly invalidated by loading a NULL dictionary.
  *  Note 2 : Loading a dictionary involves building tables.
  *           It's also a CPU consuming operation, with non-negligible impact on latency.
  *           Tables are dependent on compression parameters, and for this reason,
@@ -947,11 +1050,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
  *           Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
  *           In such a case, dictionary buffer must outlive its users.
  *  Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
- *           to precisely select how dictionary content must be interpreted. */
+ *           to precisely select how dictionary content must be interpreted.
+ *  Note 5 : This method does not benefit from LDM (long distance mode).
+ *           If you want to employ LDM on some large dictionary content,
+ *           prefer employing ZSTD_CCtx_refPrefix() described below.
+ */
 ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
 
 /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+
- *  Reference a prepared dictionary, to be used for all next compressed frames.
+ *  Reference a prepared dictionary, to be used for all future compressed frames.
  *  Note that compression parameters are enforced from within CDict,
  *  and supersede any compression parameter previously set within CCtx.
  *  The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
@@ -970,6 +1077,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
  *  Decompression will need same prefix to properly regenerate data.
  *  Compressing with a prefix is similar in outcome as performing a diff and compressing it,
  *  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ *  This method is compatible with LDM (long distance mode).
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
  *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
  *  Note 1 : Prefix buffer is referenced. It **must** outlive compression.
@@ -986,9 +1094,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
                                  const void* prefix, size_t prefixSize);
 
 /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+
- *  Create an internal DDict from dict buffer,
- *  to be used to decompress next frames.
- *  The dictionary remains valid for all future frames, until explicitly invalidated.
+ *  Create an internal DDict from dict buffer, to be used to decompress all future frames.
+ *  The dictionary remains valid for all future frames, until explicitly invalidated, or
+ *  a new dictionary is loaded.
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
  *  Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
  *            meaning "return to no-dictionary mode".
@@ -1012,9 +1120,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s
  *  The memory for the table is allocated on the first call to refDDict, and can be
  *  freed with ZSTD_freeDCtx().
  *
+ *  If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary
+ *  will be managed, and referencing a dictionary effectively "discards" any previous one.
+ *
  * @result : 0, or an error code (which can be tested with ZSTD_isError()).
- *  Note 1 : Currently, only one dictionary can be managed.
- *           Referencing a new dictionary effectively "discards" any previous one.
  *  Special: referencing a NULL DDict means "return to no-dictionary mode".
  *  Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
  */
@@ -1071,24 +1180,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
 #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE
 #endif
 
-/* Deprecation warnings :
- * Should these warnings be a problem, it is generally possible to disable them,
- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual.
- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS.
- */
-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS
-#  define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API  /* disable deprecation warnings */
-#else
-#  if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__)
-#    define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message)))
-#  elif (__GNUC__ >= 3)
-#    define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated))
-#  else
-#    pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
-#    define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API
-#  endif
-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */
-
 /* **************************************************************************************
  *   experimental API (static linking only)
  ****************************************************************************************
@@ -1123,6 +1214,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
 #define ZSTD_TARGETLENGTH_MIN     0   /* note : comparing this constant to an unsigned results in a tautological test */
 #define ZSTD_STRATEGY_MIN        ZSTD_fast
 #define ZSTD_STRATEGY_MAX        ZSTD_btultra2
+#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */
 
 
 #define ZSTD_OVERLAPLOG_MIN       0
@@ -1146,7 +1238,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
 #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
 
 /* Advanced parameter bounds */
-#define ZSTD_TARGETCBLOCKSIZE_MIN   64
+#define ZSTD_TARGETCBLOCKSIZE_MIN   1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */
 #define ZSTD_TARGETCBLOCKSIZE_MAX   ZSTD_BLOCKSIZE_MAX
 #define ZSTD_SRCSIZEHINT_MIN        0
 #define ZSTD_SRCSIZEHINT_MAX        INT_MAX
@@ -1303,7 +1395,7 @@ typedef enum {
 } ZSTD_paramSwitch_e;
 
 /* *************************************
-*  Frame size functions
+*  Frame header and size functions
 ***************************************/
 
 /*! ZSTD_findDecompressedSize() :
@@ -1350,29 +1442,122 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size
  *           or an error code (if srcSize is too small) */
 ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
 
+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
+typedef struct {
+    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
+    unsigned long long windowSize;       /* can be very large, up to <= frameContentSize */
+    unsigned blockSizeMax;
+    ZSTD_frameType_e frameType;          /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+    unsigned headerSize;
+    unsigned dictID;
+    unsigned checksumFlag;
+    unsigned _reserved1;
+    unsigned _reserved2;
+} ZSTD_frameHeader;
+
+/*! ZSTD_getFrameHeader() :
+ *  decode Frame Header, or requires larger `srcSize`.
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ *           or an error code, which can be tested using ZSTD_isError() */
+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /*< doesn't consume input */
+/*! ZSTD_getFrameHeader_advanced() :
+ *  same as ZSTD_getFrameHeader(),
+ *  with added capability to select a format (like ZSTD_f_zstd1_magicless) */
+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
+
+/*! ZSTD_decompressionMargin() :
+ * Zstd supports in-place decompression, where the input and output buffers overlap.
+ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large,
+ * and the input buffer must be at the end of the output buffer.
+ *
+ *  _______________________ Output Buffer ________________________
+ * |                                                              |
+ * |                                        ____ Input Buffer ____|
+ * |                                       |                      |
+ * v                                       v                      v
+ * |---------------------------------------|-----------|----------|
+ * ^                                                   ^          ^
+ * |___________________ Output_Size ___________________|_ Margin _|
+ *
+ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN().
+ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or
+ * ZSTD_decompressDCtx().
+ * NOTE: This function supports multi-frame input.
+ *
+ * @param src The compressed frame(s)
+ * @param srcSize The size of the compressed frame(s)
+ * @returns The decompression margin or an error that can be checked with ZSTD_isError().
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize);
+
+/*! ZSTD_DECOMPRESS_MARGIN() :
+ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from
+ * the compressed frame, compute it from the original size and the blockSizeLog.
+ * See ZSTD_decompressionMargin() for details.
+ *
+ * WARNING: This macro does not support multi-frame input, the input must be a single
+ * zstd frame. If you need that support use the function, or implement it yourself.
+ *
+ * @param originalSize The original uncompressed size of the data.
+ * @param blockSize    The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX).
+ *                     Unless you explicitly set the windowLog smaller than
+ *                     ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX.
+ */
+#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)(                                              \
+        ZSTD_FRAMEHEADERSIZE_MAX                                                              /* Frame header */ + \
+        4                                                                                         /* checksum */ + \
+        ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \
+        (blockSize)                                                                    /* One block of margin */   \
+    ))
+
 typedef enum {
   ZSTD_sf_noBlockDelimiters = 0,         /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
   ZSTD_sf_explicitBlockDelimiters = 1    /* Representation of ZSTD_Sequence contains explicit block delimiters */
 } ZSTD_sequenceFormat_e;
 
+/*! ZSTD_sequenceBound() :
+ * `srcSize` : size of the input buffer
+ *  @return : upper-bound for the number of sequences that can be generated
+ *            from a buffer of srcSize bytes
+ *
+ *  note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize);
+
 /*! ZSTD_generateSequences() :
- * Generate sequences using ZSTD_compress2, given a source buffer.
+ * WARNING: This function is meant for debugging and informational purposes ONLY!
+ * Its implementation is flawed, and it will be deleted in a future version.
+ * It is not guaranteed to succeed, as there are several cases where it will give
+ * up and fail. You should NOT use this function in production code.
+ *
+ * This function is deprecated, and will be removed in a future version.
+ *
+ * Generate sequences using ZSTD_compress2(), given a source buffer.
+ *
+ * @param zc The compression context to be used for ZSTD_compress2(). Set any
+ *           compression parameters you need on this context.
+ * @param outSeqs The output sequences buffer of size @p outSeqsSize
+ * @param outSeqsSize The size of the output sequences buffer.
+ *                    ZSTD_sequenceBound(srcSize) is an upper bound on the number
+ *                    of sequences that can be generated.
+ * @param src The source buffer to generate sequences from of size @p srcSize.
+ * @param srcSize The size of the source buffer.
  *
  * Each block will end with a dummy sequence
  * with offset == 0, matchLength == 0, and litLength == length of last literals.
  * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0)
  * simply acts as a block delimiter.
  *
- * zc can be used to insert custom compression params.
- * This function invokes ZSTD_compress2
- *
- * The output of this function can be fed into ZSTD_compressSequences() with CCtx
- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters
- * @return : number of sequences generated
+ * @returns The number of sequences generated, necessarily less than
+ *          ZSTD_sequenceBound(srcSize), or an error code that can be checked
+ *          with ZSTD_isError().
  */
-
-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
-                                          size_t outSeqsSize, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()")
+ZSTDLIB_STATIC_API size_t
+ZSTD_generateSequences(ZSTD_CCtx* zc,
+                       ZSTD_Sequence* outSeqs, size_t outSeqsSize,
+                       const void* src, size_t srcSize);
 
 /*! ZSTD_mergeBlockDelimiters() :
  * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals
@@ -1388,7 +1573,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o
 ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize);
 
 /*! ZSTD_compressSequences() :
- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst.
+ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst.
+ * @src contains the entire input (not just the literals).
+ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals
  * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.)
  * The entire source is compressed into a single frame.
  *
@@ -1413,11 +1600,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si
  * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused.
  * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly,
  *         and cannot emit an RLE block that disagrees with the repcode history
- * @return : final compressed size or a ZSTD error.
+ * @return : final compressed size, or a ZSTD error code.
  */
-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize,
-                                  const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
-                                  const void* src, size_t srcSize);
+ZSTDLIB_STATIC_API size_t
+ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize,
+                        const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
+                        const void* src, size_t srcSize);
 
 
 /*! ZSTD_writeSkippableFrame() :
@@ -1464,48 +1652,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size);
 /*! ZSTD_estimate*() :
  *  These functions make it possible to estimate memory usage
  *  of a future {D,C}Ctx, before its creation.
+ *  This is useful in combination with ZSTD_initStatic(),
+ *  which makes it possible to employ a static buffer for ZSTD_CCtx* state.
  *
  *  ZSTD_estimateCCtxSize() will provide a memory budget large enough
- *  for any compression level up to selected one.
- *  Note : Unlike ZSTD_estimateCStreamSize*(), this estimate
- *         does not include space for a window buffer.
- *         Therefore, the estimation is only guaranteed for single-shot compressions, not streaming.
+ *  to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2()
+ *  associated with any compression level up to max specified one.
  *  The estimate will assume the input may be arbitrarily large,
  *  which is the worst case.
  *
+ *  Note that the size estimation is specific for one-shot compression,
+ *  it is not valid for streaming (see ZSTD_estimateCStreamSize*())
+ *  nor other potential ways of using a ZSTD_CCtx* state.
+ *
  *  When srcSize can be bound by a known and rather "small" value,
- *  this fact can be used to provide a tighter estimation
- *  because the CCtx compression context will need less memory.
- *  This tighter estimation can be provided by more advanced functions
+ *  this knowledge can be used to provide a tighter budget estimation
+ *  because the ZSTD_CCtx* state will need less memory for small inputs.
+ *  This tighter estimation can be provided by employing more advanced functions
  *  ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(),
  *  and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
  *  Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
  *
- *  Note 2 : only single-threaded compression is supported.
+ *  Note : only single-threaded compression is supported.
  *  ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
  */
-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void);
 
 /*! ZSTD_estimateCStreamSize() :
- *  ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
- *  It will also consider src size to be arbitrarily "large", which is worst case.
+ *  ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression
+ *  using any compression level up to the max specified one.
+ *  It will also consider src size to be arbitrarily "large", which is a worst case scenario.
  *  If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
  *  ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
  *  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1.
  *  Note : CStream size estimation is only correct for single-threaded compression.
- *  ZSTD_DStream memory budget depends on window Size.
+ *  ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ *  Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time.
+ *  Size estimates assume that no external sequence producer is registered.
+ *
+ *  ZSTD_DStream memory budget depends on frame's window Size.
  *  This information can be passed manually, using ZSTD_estimateDStreamSize,
  *  or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+ *  Any frame requesting a window size larger than max specified one will be rejected.
  *  Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
  *         an internal ?Dict will be created, which additional size is not estimated here.
- *         In this case, get total size by adding ZSTD_estimate?DictSize */
-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ *         In this case, get total size by adding ZSTD_estimate?DictSize
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize);
 ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
 
 /*! ZSTD_estimate?DictSize() :
@@ -1649,22 +1848,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
  *  This function never fails (wide contract) */
 ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
 
+/*! ZSTD_CCtx_setCParams() :
+ *  Set all parameters provided within @p cparams into the working @p cctx.
+ *  Note : if modifying parameters during compression (MT mode only),
+ *         note that changes to the .windowLog parameter will be ignored.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
+ *         On failure, no parameters are updated.
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams);
+
+/*! ZSTD_CCtx_setFParams() :
+ *  Set all parameters provided within @p fparams into the working @p cctx.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams);
+
+/*! ZSTD_CCtx_setParams() :
+ *  Set all parameters provided within @p params into the working @p cctx.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()).
+ */
+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params);
+
 /*! ZSTD_compress_advanced() :
  *  Note : this function is now DEPRECATED.
  *         It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters.
  *  This prototype will generate compilation warnings. */
 ZSTD_DEPRECATED("use ZSTD_compress2")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
-                                          void* dst, size_t dstCapacity,
-                                    const void* src, size_t srcSize,
-                                    const void* dict,size_t dictSize,
-                                          ZSTD_parameters params);
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize,
+                        const void* dict,size_t dictSize,
+                              ZSTD_parameters params);
 
 /*! ZSTD_compress_usingCDict_advanced() :
  *  Note : this function is now DEPRECATED.
  *         It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters.
  *  This prototype will generate compilation warnings. */
 ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
                                               void* dst, size_t dstCapacity,
                                         const void* src, size_t srcSize,
@@ -1737,11 +1959,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  */
 #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5
 
-/* Tries to fit compressed block size to be around targetCBlockSize.
- * No target when targetCBlockSize == 0.
- * There is no guarantee on compressed block size (default:0) */
-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
-
 /* User's best guess of source size.
  * Hint is not valid when srcSizeHint == 0.
  * There is no guarantee that hint is close to actual source size,
@@ -1808,13 +2025,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  * Experimental parameter.
  * Default is 0 == disabled. Set to 1 to enable.
  *
- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same
- * between calls, except for the modifications that zstd makes to pos (the
- * caller must not modify pos). This is checked by the compressor, and
- * compression will fail if it ever changes. This means the only flush
- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end
- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos)
- * MUST not be modified during compression or you will get data corruption.
+ * Tells the compressor that input data presented with ZSTD_inBuffer
+ * will ALWAYS be the same between calls.
+ * Technically, the @src pointer must never be changed,
+ * and the @pos field can only be updated by zstd.
+ * However, it's possible to increase the @size field,
+ * allowing scenarios where more data can be appended after compressions starts.
+ * These conditions are checked by the compressor,
+ * and compression will fail if they are not respected.
+ * Also, data in the ZSTD_inBuffer within the range [src, src + pos)
+ * MUST not be modified during compression or it will result in data corruption.
  *
  * When this flag is enabled zstd won't allocate an input window buffer,
  * because the user guarantees it can reference the ZSTD_inBuffer until
@@ -1822,18 +2042,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also
  * avoid the memcpy() from the input buffer to the input window buffer.
  *
- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used.
- * That means this flag cannot be used with ZSTD_compressStream().
- *
  * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using
  * this flag is ALWAYS memory safe, and will never access out-of-bounds
- * memory. However, compression WILL fail if you violate the preconditions.
+ * memory. However, compression WILL fail if conditions are not respected.
  *
- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST
- * not be modified during compression or you will get data corruption. This
- * is because zstd needs to reference data in the ZSTD_inBuffer to find
+ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST
+ * not be modified during compression or it will result in data corruption.
+ * This is because zstd needs to reference data in the ZSTD_inBuffer to find
  * matches. Normally zstd maintains its own window buffer for this purpose,
- * but passing this flag tells zstd to use the user provided buffer.
+ * but passing this flag tells zstd to rely on user provided buffer instead.
  */
 #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9
 
@@ -1878,7 +2095,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  * Without validation, providing a sequence that does not conform to the zstd spec will cause
  * undefined behavior, and may produce a corrupted block.
  *
- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for
+ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for
  * specifics regarding offset/matchlength requirements) then the function will bail out and
  * return an error.
  *
@@ -1928,6 +2145,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo
  */
 #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15
 
+/* ZSTD_c_prefetchCDictTables
+ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto.
+ *
+ * In some situations, zstd uses CDict tables in-place rather than copying them
+ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details).
+ * In such situations, compression speed is seriously impacted when CDict tables are
+ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables
+ * when they are used in-place.
+ *
+ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit.
+ * For sufficiently large inputs, zstd will by default memcpy() CDict tables
+ * into the working context, so there is no need to prefetch. This parameter is
+ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be
+ * useful but memcpy() is too expensive. The exact range of input sizes where this
+ * makes sense is best determined by careful experimentation.
+ *
+ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable,
+ * but in the future zstd may conditionally enable this feature via an auto-detection
+ * heuristic for cold CDicts.
+ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances.
+ */
+#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16
+
+/* ZSTD_c_enableSeqProducerFallback
+ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0.
+ *
+ * Controls whether zstd will fall back to an internal sequence producer if an
+ * external sequence producer is registered and returns an error code. This fallback
+ * is block-by-block: the internal sequence producer will only be called for blocks
+ * where the external sequence producer returns an error code. Fallback parsing will
+ * follow any other cParam settings, such as compression level, the same as in a
+ * normal (fully-internal) compression operation.
+ *
+ * The user is strongly encouraged to read the full Block-Level Sequence Producer API
+ * documentation (below) before setting this parameter. */
+#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17
+
+/* ZSTD_c_maxBlockSize
+ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB).
+ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default.
+ *
+ * This parameter can be used to set an upper bound on the blocksize
+ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper
+ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make
+ * compressBound() inaccurate). Only currently meant to be used for testing.
+ *
+ */
+#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18
+
+/* ZSTD_c_searchForExternalRepcodes
+ * This parameter affects how zstd parses external sequences, such as sequences
+ * provided through the compressSequences() API or from an external block-level
+ * sequence producer.
+ *
+ * If set to ZSTD_ps_enable, the library will check for repeated offsets in
+ * external sequences, even if those repcodes are not explicitly indicated in
+ * the "rep" field. Note that this is the only way to exploit repcode matches
+ * while using compressSequences() or an external sequence producer, since zstd
+ * currently ignores the "rep" field of external sequences.
+ *
+ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in
+ * external sequences, regardless of whether the "rep" field has been set. This
+ * reduces sequence compression overhead by about 25% while sacrificing some
+ * compression ratio.
+ *
+ * The default value is ZSTD_ps_auto, for which the library will enable/disable
+ * based on compression level.
+ *
+ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is
+ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future.
+ */
+#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19
+
 /*! ZSTD_CCtx_getParameter() :
  *  Get the requested compression parameter value, selected by enum ZSTD_cParameter,
  *  and store it into int* value.
@@ -2084,7 +2374,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
  * in the range [dst, dst + pos) MUST not be modified during decompression
  * or you will get data corruption.
  *
- * When this flags is enabled zstd won't allocate an output buffer, because
+ * When this flag is enabled zstd won't allocate an output buffer, because
  * it can write directly to the ZSTD_outBuffer, but it will still allocate
  * an input buffer large enough to fit any compressed block. This will also
  * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
@@ -2137,6 +2427,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
  */
 #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4
 
+/* ZSTD_d_disableHuffmanAssembly
+ * Set to 1 to disable the Huffman assembly implementation.
+ * The default value is 0, which allows zstd to use the Huffman assembly
+ * implementation if available.
+ *
+ * This parameter can be used to disable Huffman assembly at runtime.
+ * If you want to disable it at compile time you can define the macro
+ * ZSTD_DISABLE_ASM.
+ */
+#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5
+
+/* ZSTD_d_maxBlockSize
+ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB).
+ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default.
+ *
+ * Forces the decompressor to reject blocks whose content size is
+ * larger than the configured maxBlockSize. When maxBlockSize is
+ * larger than the windowSize, the windowSize is used instead.
+ * This saves memory on the decoder when you know all blocks are small.
+ *
+ * This option is typically used in conjunction with ZSTD_c_maxBlockSize.
+ *
+ * WARNING: This causes the decoder to reject otherwise valid frames
+ * that have block sizes larger than the configured maxBlockSize.
+ */
+#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6
+
 
 /*! ZSTD_DCtx_setFormat() :
  *  This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter().
@@ -2145,6 +2462,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete
  *  such ZSTD_f_zstd1_magicless for example.
  * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
 ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead")
+ZSTDLIB_STATIC_API
 size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
 
 /*! ZSTD_decompressStream_simpleArgs() :
@@ -2181,6 +2499,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs (
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
                          int compressionLevel,
                          unsigned long long pledgedSrcSize);
@@ -2198,17 +2517,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
                      const void* dict, size_t dictSize,
                            int compressionLevel);
 
 /*! ZSTD_initCStream_advanced() :
- * This function is DEPRECATED, and is approximately equivalent to:
+ * This function is DEPRECATED, and is equivalent to:
  *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
- *     // Pseudocode: Set each zstd parameter and leave the rest as-is.
- *     for ((param, value) : params) {
- *         ZSTD_CCtx_setParameter(zcs, param, value);
- *     }
+ *     ZSTD_CCtx_setParams(zcs, params);
  *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
  *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
  *
@@ -2218,6 +2535,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
                     const void* dict, size_t dictSize,
                           ZSTD_parameters params,
@@ -2232,15 +2550,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
 
 /*! ZSTD_initCStream_usingCDict_advanced() :
- *   This function is DEPRECATED, and is approximately equivalent to:
+ *   This function is DEPRECATED, and is equivalent to:
  *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
- *     // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
- *     for ((fParam, value) : fParams) {
- *         ZSTD_CCtx_setParameter(zcs, fParam, value);
- *     }
+ *     ZSTD_CCtx_setFParams(zcs, fParams);
  *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
  *     ZSTD_CCtx_refCDict(zcs, cdict);
  *
@@ -2250,6 +2566,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
  * This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
                                const ZSTD_CDict* cdict,
                                      ZSTD_frameParameters fParams,
@@ -2264,7 +2581,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
  *       explicitly specified.
  *
  *  start a new frame, using same parameters from previous frame.
- *  This is typically useful to skip dictionary loading stage, since it will re-use it in-place.
+ *  This is typically useful to skip dictionary loading stage, since it will reuse it in-place.
  *  Note that zcs must be init at least once before using ZSTD_resetCStream().
  *  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
  *  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
@@ -2274,6 +2591,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
  *  This prototype will generate compilation warnings.
  */
 ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions")
+ZSTDLIB_STATIC_API
 size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
 
 
@@ -2319,8 +2637,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
  *     ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
  *
  * note: no dictionary will be used if dict == NULL or dictSize < 8
- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
  */
+ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);
 
 /*!
@@ -2330,8 +2648,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo
  *     ZSTD_DCtx_refDDict(zds, ddict);
  *
  * note : ddict is referenced, it must outlive decompression session
- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
  */
+ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);
 
 /*!
@@ -2339,18 +2657,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z
  *
  *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
  *
- * re-use decompression parameters from previous init; saves dictionary loading
- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ * reuse decompression parameters from previous init; saves dictionary loading
  */
+ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions")
 ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
 
 
+/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API *********************
+ *
+ * *** OVERVIEW ***
+ * The Block-Level Sequence Producer API allows users to provide their own custom
+ * sequence producer which libzstd invokes to process each block. The produced list
+ * of sequences (literals and matches) is then post-processed by libzstd to produce
+ * valid compressed blocks.
+ *
+ * This block-level offload API is a more granular complement of the existing
+ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers
+ * an easier migration story for applications already integrated with libzstd: the
+ * user application continues to invoke the same compression functions
+ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits
+ * from the specific advantages of the external sequence producer. For example,
+ * the sequence producer could be tuned to take advantage of known characteristics
+ * of the input, to offer better speed / ratio, or could leverage hardware
+ * acceleration not available within libzstd itself.
+ *
+ * See contrib/externalSequenceProducer for an example program employing the
+ * Block-Level Sequence Producer API.
+ *
+ * *** USAGE ***
+ * The user is responsible for implementing a function of type
+ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following
+ * arguments to the user-provided function:
+ *
+ *   - sequenceProducerState: a pointer to a user-managed state for the sequence
+ *     producer.
+ *
+ *   - outSeqs, outSeqsCapacity: an output buffer for the sequence producer.
+ *     outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory
+ *     backing outSeqs is managed by the CCtx.
+ *
+ *   - src, srcSize: an input buffer for the sequence producer to parse.
+ *     srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX.
+ *
+ *   - dict, dictSize: a history buffer, which may be empty, which the sequence
+ *     producer may reference as it parses the src buffer. Currently, zstd will
+ *     always pass dictSize == 0 into external sequence producers, but this will
+ *     change in the future.
+ *
+ *   - compressionLevel: a signed integer representing the zstd compression level
+ *     set by the user for the current operation. The sequence producer may choose
+ *     to use this information to change its compression strategy and speed/ratio
+ *     tradeoff. Note: the compression level does not reflect zstd parameters set
+ *     through the advanced API.
+ *
+ *   - windowSize: a size_t representing the maximum allowed offset for external
+ *     sequences. Note that sequence offsets are sometimes allowed to exceed the
+ *     windowSize if a dictionary is present, see doc/zstd_compression_format.md
+ *     for details.
+ *
+ * The user-provided function shall return a size_t representing the number of
+ * sequences written to outSeqs. This return value will be treated as an error
+ * code if it is greater than outSeqsCapacity. The return value must be non-zero
+ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided
+ * for convenience, but any value greater than outSeqsCapacity will be treated as
+ * an error code.
+ *
+ * If the user-provided function does not return an error code, the sequences
+ * written to outSeqs must be a valid parse of the src buffer. Data corruption may
+ * occur if the parse is not valid. A parse is defined to be valid if the
+ * following conditions hold:
+ *   - The sum of matchLengths and literalLengths must equal srcSize.
+ *   - All sequences in the parse, except for the final sequence, must have
+ *     matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have
+ *     matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0.
+ *   - All offsets must respect the windowSize parameter as specified in
+ *     doc/zstd_compression_format.md.
+ *   - If the final sequence has matchLength == 0, it must also have offset == 0.
+ *
+ * zstd will only validate these conditions (and fail compression if they do not
+ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence
+ * validation has a performance cost.
+ *
+ * If the user-provided function returns an error, zstd will either fall back
+ * to an internal sequence producer or fail the compression operation. The user can
+ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback
+ * cParam. Fallback compression will follow any other cParam settings, such as
+ * compression level, the same as in a normal compression operation.
+ *
+ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F
+ * function by calling
+ *         ZSTD_registerSequenceProducer(cctx,
+ *                                       sequenceProducerState,
+ *                                       sequenceProducer)
+ * This setting will persist until the next parameter reset of the CCtx.
+ *
+ * The sequenceProducerState must be initialized by the user before calling
+ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the
+ * sequenceProducerState.
+ *
+ * *** LIMITATIONS ***
+ * This API is compatible with all zstd compression APIs which respect advanced parameters.
+ * However, there are three limitations:
+ *
+ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported.
+ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level
+ * external sequence producer.
+ *   - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some
+ *     cases (see its documentation for details). Users must explicitly set
+ *     ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external
+ *     sequence producer is registered.
+ *   - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default
+ *     whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should
+ *     check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence
+ *     Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog).
+ *
+ * Second, history buffers are not currently supported. Concretely, zstd will always pass
+ * dictSize == 0 to the external sequence producer (for now). This has two implications:
+ *   - Dictionaries are not currently supported. Compression will *not* fail if the user
+ *     references a dictionary, but the dictionary won't have any effect.
+ *   - Stream history is not currently supported. All advanced compression APIs, including
+ *     streaming APIs, work with external sequence producers, but each block is treated as
+ *     an independent chunk without history from previous blocks.
+ *
+ * Third, multi-threading within a single compression is not currently supported. In other words,
+ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered.
+ * Multi-threading across compressions is fine: simply create one CCtx per thread.
+ *
+ * Long-term, we plan to overcome all three limitations. There is no technical blocker to
+ * overcoming them. It is purely a question of engineering effort.
+ */
+
+#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1))
+
+typedef size_t (*ZSTD_sequenceProducer_F) (
+  void* sequenceProducerState,
+  ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
+  const void* src, size_t srcSize,
+  const void* dict, size_t dictSize,
+  int compressionLevel,
+  size_t windowSize
+);
+
+/*! ZSTD_registerSequenceProducer() :
+ * Instruct zstd to use a block-level external sequence producer function.
+ *
+ * The sequenceProducerState must be initialized by the caller, and the caller is
+ * responsible for managing its lifetime. This parameter is sticky across
+ * compressions. It will remain set until the user explicitly resets compression
+ * parameters.
+ *
+ * Sequence producer registration is considered to be an "advanced parameter",
+ * part of the "advanced API". This means it will only have an effect on compression
+ * APIs which respect advanced parameters, such as compress2() and compressStream2().
+ * Older compression APIs such as compressCCtx(), which predate the introduction of
+ * "advanced parameters", will ignore any external sequence producer setting.
+ *
+ * The sequence producer can be "cleared" by registering a NULL function pointer. This
+ * removes all limitations described above in the "LIMITATIONS" section of the API docs.
+ *
+ * The user is strongly encouraged to read the full API documentation (above) before
+ * calling this function. */
+ZSTDLIB_STATIC_API void
+ZSTD_registerSequenceProducer(
+  ZSTD_CCtx* cctx,
+  void* sequenceProducerState,
+  ZSTD_sequenceProducer_F sequenceProducer
+);
+
+/*! ZSTD_CCtxParams_registerSequenceProducer() :
+ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params.
+ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(),
+ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx().
+ *
+ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx()
+ * is required, then this function is for you. Otherwise, you probably don't need it.
+ *
+ * See tests/zstreamtest.c for example usage. */
+ZSTDLIB_STATIC_API void
+ZSTD_CCtxParams_registerSequenceProducer(
+  ZSTD_CCtx_params* params,
+  void* sequenceProducerState,
+  ZSTD_sequenceProducer_F sequenceProducer
+);
+
+
 /* *******************************************************************
-*  Buffer-less and synchronous inner streaming functions
+*  Buffer-less and synchronous inner streaming functions (DEPRECATED)
+*
+*  This API is deprecated, and will be removed in a future version.
+*  It allows streaming (de)compression with user allocated buffers.
+*  However, it is hard to use, and not as well tested as the rest of
+*  our API.
 *
-*  This is an advanced API, giving full control over buffer management, for users which need direct control over memory.
-*  But it's also a complex one, with several restrictions, documented below.
-*  Prefer normal streaming API for an easier experience.
+*  Please use the normal streaming API instead: ZSTD_compressStream2,
+*  and ZSTD_decompressStream.
+*  If there is functionality that you need, but it doesn't provide,
+*  please open an issue on our GitHub.
 ********************************************************************* */
 
 /*
@@ -2358,11 +2860,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
 
   A ZSTD_CCtx object is required to track streaming operations.
   Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
-  ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+  ZSTD_CCtx object can be reused multiple times within successive compression operations.
 
   Start by initializing a context.
   Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression.
-  It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()
 
   Then, consume your input using ZSTD_compressContinue().
   There are some important considerations to keep in mind when using this advanced function :
@@ -2380,36 +2881,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
   It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
   Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
 
-  `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
+  `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again.
 */
 
 /*=====   Buffer-less streaming compression functions  =====*/
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */
-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*<  note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
 
+ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.")
+ZSTDLIB_STATIC_API
+size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*<  note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
 /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */
 ZSTD_DEPRECATED("use advanced API to access custom parameters")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
 ZSTD_DEPRECATED("use advanced API to access custom parameters")
+ZSTDLIB_STATIC_API
 size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize);   /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
 /*
   Buffer-less streaming decompression (synchronous mode)
 
   A ZSTD_DCtx object is required to track streaming operations.
   Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
-  A ZSTD_DCtx object can be re-used multiple times.
+  A ZSTD_DCtx object can be reused multiple times.
 
   First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
   Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
   Data fragment must be large enough to ensure successful decoding.
  `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
-  @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
-           >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+  result  : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+           >0 : `srcSize` is too small, please provide at least result bytes on next attempt.
            errorCode, which can be tested using ZSTD_isError().
 
   It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
@@ -2428,7 +2939,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
 
   The most memory efficient way is to use a round buffer of sufficient size.
   Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
-  which can @return an error code if required value is too large for current system (in 32-bits mode).
+  which can return an error code if required value is too large for current system (in 32-bits mode).
   In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
   up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
   which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`.
@@ -2448,7 +2959,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
   ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
   ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
 
- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+  result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
   It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
   It can also be an error code, which can be tested with ZSTD_isError().
 
@@ -2471,27 +2982,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_
 */
 
 /*=====   Buffer-less streaming decompression functions  =====*/
-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
-typedef struct {
-    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
-    unsigned long long windowSize;       /* can be very large, up to <= frameContentSize */
-    unsigned blockSizeMax;
-    ZSTD_frameType_e frameType;          /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
-    unsigned headerSize;
-    unsigned dictID;
-    unsigned checksumFlag;
-} ZSTD_frameHeader;
 
-/*! ZSTD_getFrameHeader() :
- *  decode Frame Header, or requires larger `srcSize`.
- * @return : 0, `zfhPtr` is correctly filled,
- *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
- *           or an error code, which can be tested using ZSTD_isError() */
-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /*< doesn't consume input */
-/*! ZSTD_getFrameHeader_advanced() :
- *  same as ZSTD_getFrameHeader(),
- *  with added capability to select a format (like ZSTD_f_zstd1_magicless) */
-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
 ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize);  /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
 
 ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
@@ -2502,6 +2993,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
 ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
 /* misc */
+ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.")
 ZSTDLIB_STATIC_API void   ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
 typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
 ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
@@ -2509,11 +3001,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
 
 
 
-/* ============================ */
-/*       Block level API       */
-/* ============================ */
+/* ========================================= */
+/*       Block level API (DEPRECATED)       */
+/* ========================================= */
 
 /*!
+
+    This API is deprecated in favor of the regular compression API.
+    You can get the frame header down to 2 bytes by setting:
+      - ZSTD_c_format = ZSTD_f_zstd1_magicless
+      - ZSTD_c_contentSizeFlag = 0
+      - ZSTD_c_checksumFlag = 0
+      - ZSTD_c_dictIDFlag = 0
+
+    This API is not as well tested as our normal API, so we recommend not using it.
+    We will be removing it in a future version. If the normal API doesn't provide
+    the functionality you need, please open a GitHub issue.
+
     Block functions produce and decode raw zstd blocks, without frame metadata.
     Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes).
     But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes.
@@ -2524,7 +3028,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
     - It is necessary to init context before starting
       + compression : any ZSTD_compressBegin*() variant, including with dictionary
       + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
-      + copyCCtx() and copyDCtx() can be used too
     - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
       + If input is larger than a block size, it's necessary to split input data into multiple blocks
       + For inputs larger than a single block, consider using regular ZSTD_compress() instead.
@@ -2541,11 +3044,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
 */
 
 /*=====   Raw zstd block functions  =====*/
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize   (const ZSTD_CCtx* cctx);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.")
 ZSTDLIB_STATIC_API size_t ZSTD_insertBlock    (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
 
-
 #endif   /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
 
diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile
index 20f08c644b71..464c410b2768 100644
--- a/lib/zstd/Makefile
+++ b/lib/zstd/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 # ################################################################
-# Copyright (c) Facebook, Inc.
+# Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h
new file mode 100644
index 000000000000..16c3d08e8d1a
--- /dev/null
+++ b/lib/zstd/common/allocations.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This file provides custom allocation primitives
+ */
+
+#define ZSTD_DEPS_NEED_MALLOC
+#include "zstd_deps.h"   /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
+
+#include "compiler.h" /* MEM_STATIC */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <linux/zstd.h> /* ZSTD_customMem */
+
+#ifndef ZSTD_ALLOCATIONS_H
+#define ZSTD_ALLOCATIONS_H
+
+/* custom memory allocation functions */
+
+MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc)
+        return customMem.customAlloc(customMem.opaque, size);
+    return ZSTD_malloc(size);
+}
+
+MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc) {
+        /* calloc implemented as malloc+memset;
+         * not as efficient as calloc, but next best guess for custom malloc */
+        void* const ptr = customMem.customAlloc(customMem.opaque, size);
+        ZSTD_memset(ptr, 0, size);
+        return ptr;
+    }
+    return ZSTD_calloc(1, size);
+}
+
+MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
+{
+    if (ptr!=NULL) {
+        if (customMem.customFree)
+            customMem.customFree(customMem.opaque, ptr);
+        else
+            ZSTD_free(ptr);
+    }
+}
+
+#endif /* ZSTD_ALLOCATIONS_H */
diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h
new file mode 100644
index 000000000000..aa3487ec4b6a
--- /dev/null
+++ b/lib/zstd/common/bits.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_BITS_H
+#define ZSTD_BITS_H
+
+#include "mem.h"
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val)
+{
+    assert(val != 0);
+    {
+        static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3,
+                                                30, 22, 20, 15, 25, 17, 4, 8,
+                                                31, 27, 13, 23, 21, 19, 16, 7,
+                                                26, 12, 18, 6, 11, 5, 10, 9};
+        return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27];
+    }
+}
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4)
+        return (unsigned)__builtin_ctz(val);
+#   else
+        return ZSTD_countTrailingZeros32_fallback(val);
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) {
+    assert(val != 0);
+    {
+        static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29,
+                                            11, 14, 16, 18, 22, 25, 3, 30,
+                                            8, 12, 20, 28, 15, 17, 24, 7,
+                                            19, 27, 23, 6, 26, 5, 4, 31};
+        val |= val >> 1;
+        val |= val >> 2;
+        val |= val >> 4;
+        val |= val >> 8;
+        val |= val >> 16;
+        return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27];
+    }
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4)
+        return (unsigned)__builtin_clz(val);
+#   else
+        return ZSTD_countLeadingZeros32_fallback(val);
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4) && defined(__LP64__)
+        return (unsigned)__builtin_ctzll(val);
+#   else
+        {
+            U32 mostSignificantWord = (U32)(val >> 32);
+            U32 leastSignificantWord = (U32)val;
+            if (leastSignificantWord == 0) {
+                return 32 + ZSTD_countTrailingZeros32(mostSignificantWord);
+            } else {
+                return ZSTD_countTrailingZeros32(leastSignificantWord);
+            }
+        }
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val)
+{
+    assert(val != 0);
+#   if (__GNUC__ >= 4)
+        return (unsigned)(__builtin_clzll(val));
+#   else
+        {
+            U32 mostSignificantWord = (U32)(val >> 32);
+            U32 leastSignificantWord = (U32)val;
+            if (mostSignificantWord == 0) {
+                return 32 + ZSTD_countLeadingZeros32(leastSignificantWord);
+            } else {
+                return ZSTD_countLeadingZeros32(mostSignificantWord);
+            }
+        }
+#   endif
+}
+
+MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val)
+{
+    if (MEM_isLittleEndian()) {
+        if (MEM_64bits()) {
+            return ZSTD_countTrailingZeros64((U64)val) >> 3;
+        } else {
+            return ZSTD_countTrailingZeros32((U32)val) >> 3;
+        }
+    } else {  /* Big Endian CPU */
+        if (MEM_64bits()) {
+            return ZSTD_countLeadingZeros64((U64)val) >> 3;
+        } else {
+            return ZSTD_countLeadingZeros32((U32)val) >> 3;
+        }
+    }
+}
+
+MEM_STATIC unsigned ZSTD_highbit32(U32 val)   /* compress, dictBuilder, decodeCorpus */
+{
+    assert(val != 0);
+    return 31 - ZSTD_countLeadingZeros32(val);
+}
+
+/* ZSTD_rotateRight_*():
+ * Rotates a bitfield to the right by "count" bits.
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
+ */
+MEM_STATIC
+U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
+    assert(count < 64);
+    count &= 0x3F; /* for fickle pattern recognition */
+    return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
+}
+
+MEM_STATIC
+U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
+    assert(count < 32);
+    count &= 0x1F; /* for fickle pattern recognition */
+    return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
+}
+
+MEM_STATIC
+U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
+    assert(count < 16);
+    count &= 0x0F; /* for fickle pattern recognition */
+    return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
+}
+
+#endif /* ZSTD_BITS_H */
diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h
index feef3a1b1d60..6a13f1f0f1e8 100644
--- a/lib/zstd/common/bitstream.h
+++ b/lib/zstd/common/bitstream.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * bitstream
  * Part of FSE library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -27,6 +28,7 @@
 #include "compiler.h"       /* UNLIKELY() */
 #include "debug.h"          /* assert(), DEBUGLOG(), RAWLOG() */
 #include "error_private.h"  /* error codes and messages */
+#include "bits.h"           /* ZSTD_highbit32 */
 
 
 /*=========================================
@@ -79,19 +81,20 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
 /*-********************************************
 *  bitStream decoding API (read backward)
 **********************************************/
+typedef size_t BitContainerType;
 typedef struct {
-    size_t   bitContainer;
+    BitContainerType bitContainer;
     unsigned bitsConsumed;
     const char* ptr;
     const char* start;
     const char* limitPtr;
 } BIT_DStream_t;
 
-typedef enum { BIT_DStream_unfinished = 0,
-               BIT_DStream_endOfBuffer = 1,
-               BIT_DStream_completed = 2,
-               BIT_DStream_overflow = 3 } BIT_DStream_status;  /* result of BIT_reloadDStream() */
-               /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+typedef enum { BIT_DStream_unfinished = 0,  /* fully refilled */
+               BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */
+               BIT_DStream_completed = 2,   /* bitstream entirely consumed, bit-exact */
+               BIT_DStream_overflow = 3     /* user requested more bits than present in bitstream */
+    } BIT_DStream_status;  /* result of BIT_reloadDStream() */
 
 MEM_STATIC size_t   BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
 MEM_STATIC size_t   BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
@@ -101,7 +104,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
 
 /* Start by invoking BIT_initDStream().
 *  A chunk of the bitStream is then stored into a local register.
-*  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+*  Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType).
 *  You can then retrieve bitFields stored into the local register, **in reverse order**.
 *  Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
 *  A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
@@ -122,33 +125,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
 MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
 /* faster, but works only if nbBits >= 1 */
 
-
-
-/*-**************************************************************
-*  Internal functions
-****************************************************************/
-MEM_STATIC unsigned BIT_highbit32 (U32 val)
-{
-    assert(val != 0);
-    {
-#   if (__GNUC__ >= 3)   /* Use GCC Intrinsic */
-        return __builtin_clz (val) ^ 31;
-#   else   /* Software version */
-        static const unsigned DeBruijnClz[32] = { 0,  9,  1, 10, 13, 21,  2, 29,
-                                                 11, 14, 16, 18, 22, 25,  3, 30,
-                                                  8, 12, 20, 28, 15, 17, 24,  7,
-                                                 19, 27, 23,  6, 26,  5,  4, 31 };
-        U32 v = val;
-        v |= v >> 1;
-        v |= v >> 2;
-        v |= v >> 4;
-        v |= v >> 8;
-        v |= v >> 16;
-        return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
-#   endif
-    }
-}
-
 /*=====    Local Constants   =====*/
 static const unsigned BIT_mask[] = {
     0,          1,         3,         7,         0xF,       0x1F,
@@ -178,6 +154,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
     return 0;
 }
 
+FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
+{
+    assert(nbBits < BIT_MASK_SIZE);
+    return bitContainer & BIT_mask[nbBits];
+}
+
 /*! BIT_addBits() :
  *  can add up to 31 bits into `bitC`.
  *  Note : does not check for register overflow ! */
@@ -187,7 +169,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
     DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32);
     assert(nbBits < BIT_MASK_SIZE);
     assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
-    bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
+    bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos;
     bitC->bitPos += nbBits;
 }
 
@@ -266,35 +248,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
         bitD->ptr   = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
         bitD->bitContainer = MEM_readLEST(bitD->ptr);
         { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
-          bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;  /* ensures bitsConsumed is always set */
+          bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;  /* ensures bitsConsumed is always set */
           if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
     } else {
         bitD->ptr   = bitD->start;
         bitD->bitContainer = *(const BYTE*)(bitD->start);
         switch(srcSize)
         {
-        case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
+        case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
                 ZSTD_FALLTHROUGH;
 
-        case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
+        case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
                 ZSTD_FALLTHROUGH;
 
-        case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
+        case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
                 ZSTD_FALLTHROUGH;
 
-        case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24;
+        case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24;
                 ZSTD_FALLTHROUGH;
 
-        case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16;
+        case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16;
                 ZSTD_FALLTHROUGH;
 
-        case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) <<  8;
+        case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) <<  8;
                 ZSTD_FALLTHROUGH;
 
         default: break;
         }
         {   BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
-            bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+            bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
             if (lastByte == 0) return ERROR(corruption_detected);  /* endMark not present */
         }
         bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
@@ -303,12 +285,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
     return srcSize;
 }
 
-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start)
+FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start)
 {
     return bitContainer >> start;
 }
 
-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)
+FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits)
 {
     U32 const regMask = sizeof(bitContainer)*8 - 1;
     /* if start > regMask, bitstream is corrupted, and result is undefined */
@@ -325,19 +307,13 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c
 #endif
 }
 
-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
-{
-    assert(nbBits < BIT_MASK_SIZE);
-    return bitContainer & BIT_mask[nbBits];
-}
-
 /*! BIT_lookBits() :
  *  Provides next n bits from local register.
  *  local register is not modified.
  *  On 32-bits, maxNbBits==24.
  *  On 64-bits, maxNbBits==56.
  * @return : value extracted */
-MEM_STATIC  FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t*  bitD, U32 nbBits)
+FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t*  bitD, U32 nbBits)
 {
     /* arbitrate between double-shift and shift+mask */
 #if 1
@@ -360,7 +336,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
     return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);
 }
 
-MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
 {
     bitD->bitsConsumed += nbBits;
 }
@@ -369,7 +345,7 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
  *  Read (consume) next n bits from local register and update.
  *  Pay attention to not read more than nbBits contained into local register.
  * @return : extracted value. */
-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits)
+FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits)
 {
     size_t const value = BIT_lookBits(bitD, nbBits);
     BIT_skipBits(bitD, nbBits);
@@ -377,7 +353,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n
 }
 
 /*! BIT_readBitsFast() :
- *  unsafe version; only works only if nbBits >= 1 */
+ *  unsafe version; only works if nbBits >= 1 */
 MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
 {
     size_t const value = BIT_lookBitsFast(bitD, nbBits);
@@ -386,6 +362,21 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
     return value;
 }
 
+/*! BIT_reloadDStream_internal() :
+ *  Simple variant of BIT_reloadDStream(), with two conditions:
+ *  1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8
+ *  2. look window is valid after shifted down : bitD->ptr >= bitD->start
+ */
+MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD)
+{
+    assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8);
+    bitD->ptr -= bitD->bitsConsumed >> 3;
+    assert(bitD->ptr >= bitD->start);
+    bitD->bitsConsumed &= 7;
+    bitD->bitContainer = MEM_readLEST(bitD->ptr);
+    return BIT_DStream_unfinished;
+}
+
 /*! BIT_reloadDStreamFast() :
  *  Similar to BIT_reloadDStream(), but with two differences:
  *  1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold!
@@ -396,31 +387,35 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
 {
     if (UNLIKELY(bitD->ptr < bitD->limitPtr))
         return BIT_DStream_overflow;
-    assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8);
-    bitD->ptr -= bitD->bitsConsumed >> 3;
-    bitD->bitsConsumed &= 7;
-    bitD->bitContainer = MEM_readLEST(bitD->ptr);
-    return BIT_DStream_unfinished;
+    return BIT_reloadDStream_internal(bitD);
 }
 
 /*! BIT_reloadDStream() :
  *  Refill `bitD` from buffer previously set in BIT_initDStream() .
- *  This function is safe, it guarantees it will not read beyond src buffer.
+ *  This function is safe, it guarantees it will not never beyond src buffer.
  * @return : status of `BIT_DStream_t` internal register.
  *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
 {
-    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* overflow detected, like end of stream */
+    /* note : once in overflow mode, a bitstream remains in this mode until it's reset */
+    if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) {
+        static const BitContainerType zeroFilled = 0;
+        bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */
+        /* overflow detected, erroneous scenario or end of stream: no update */
         return BIT_DStream_overflow;
+    }
+
+    assert(bitD->ptr >= bitD->start);
 
     if (bitD->ptr >= bitD->limitPtr) {
-        return BIT_reloadDStreamFast(bitD);
+        return BIT_reloadDStream_internal(bitD);
     }
     if (bitD->ptr == bitD->start) {
+        /* reached end of bitStream => no update */
         if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
         return BIT_DStream_completed;
     }
-    /* start < ptr < limitPtr */
+    /* start < ptr < limitPtr => cautious update */
     {   U32 nbBytes = bitD->bitsConsumed >> 3;
         BIT_DStream_status result = BIT_DStream_unfinished;
         if (bitD->ptr - nbBytes < bitD->start) {
diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h
index c42d39faf9bd..508ee25537bb 100644
--- a/lib/zstd/common/compiler.h
+++ b/lib/zstd/common/compiler.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,6 +12,8 @@
 #ifndef ZSTD_COMPILER_H
 #define ZSTD_COMPILER_H
 
+#include <linux/types.h>
+
 #include "portability_macros.h"
 
 /*-*******************************************************
@@ -41,12 +44,15 @@
 */
 #define WIN_CDECL
 
+/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
+#define UNUSED_ATTR __attribute__((unused))
+
 /*
  * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
  * parameters. They must be inlined for the compiler to eliminate the constant
  * branches.
  */
-#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR
 /*
  * HINT_INLINE is used to help the compiler generate better code. It is *not*
  * used for "templates", so it can be tweaked based on the compilers
@@ -61,11 +67,21 @@
 #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
 #  define HINT_INLINE static INLINE_KEYWORD
 #else
-#  define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
+#  define HINT_INLINE FORCE_INLINE_TEMPLATE
 #endif
 
-/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
-#define UNUSED_ATTR __attribute__((unused))
+/* "soft" inline :
+ * The compiler is free to select if it's a good idea to inline or not.
+ * The main objective is to silence compiler warnings
+ * when a defined function in included but not used.
+ *
+ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit.
+ * Updating the prefix is probably preferable, but requires a fairly large codemod,
+ * since this name is used everywhere.
+ */
+#ifndef MEM_STATIC  /* already defined in Linux Kernel mem.h */
+#define MEM_STATIC static __inline UNUSED_ATTR
+#endif
 
 /* force no inlining */
 #define FORCE_NOINLINE static __attribute__((__noinline__))
@@ -86,23 +102,24 @@
 #  define PREFETCH_L1(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
 #  define PREFETCH_L2(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
 #elif defined(__aarch64__)
-#  define PREFETCH_L1(ptr)  __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr)))
-#  define PREFETCH_L2(ptr)  __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr)))
+#  define PREFETCH_L1(ptr)  do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0)
+#  define PREFETCH_L2(ptr)  do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0)
 #else
-#  define PREFETCH_L1(ptr) (void)(ptr)  /* disabled */
-#  define PREFETCH_L2(ptr) (void)(ptr)  /* disabled */
+#  define PREFETCH_L1(ptr) do { (void)(ptr); } while (0)  /* disabled */
+#  define PREFETCH_L2(ptr) do { (void)(ptr); } while (0)  /* disabled */
 #endif  /* NO_PREFETCH */
 
 #define CACHELINE_SIZE 64
 
-#define PREFETCH_AREA(p, s)  {            \
-    const char* const _ptr = (const char*)(p);  \
-    size_t const _size = (size_t)(s);     \
-    size_t _pos;                          \
-    for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) {  \
-        PREFETCH_L2(_ptr + _pos);         \
-    }                                     \
-}
+#define PREFETCH_AREA(p, s)                              \
+    do {                                                 \
+        const char* const _ptr = (const char*)(p);       \
+        size_t const _size = (size_t)(s);                \
+        size_t _pos;                                     \
+        for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \
+            PREFETCH_L2(_ptr + _pos);                    \
+        }                                                \
+    } while (0)
 
 /* vectorization
  * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax,
@@ -126,9 +143,9 @@
 #define UNLIKELY(x) (__builtin_expect((x), 0))
 
 #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)))
-#  define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); }
+#  define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0)
 #else
-#  define ZSTD_UNREACHABLE { assert(0); }
+#  define ZSTD_UNREACHABLE do { assert(0); } while (0)
 #endif
 
 /* disable warnings */
@@ -179,6 +196,85 @@
 *  Sanitizer
 *****************************************************************/
 
+/*
+ * Zstd relies on pointer overflow in its decompressor.
+ * We add this attribute to functions that rely on pointer overflow.
+ */
+#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+#  if __has_attribute(no_sanitize)
+#    if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8
+       /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */
+#      define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow")))
+#    else
+       /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */
+#      define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow")))
+#    endif
+#  else
+#    define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+#  endif
+#endif
+
+/*
+ * Helper function to perform a wrapped pointer difference without trigging
+ * UBSAN.
+ *
+ * @returns lhs - rhs with wrapping
+ */
+MEM_STATIC
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs)
+{
+    return lhs - rhs;
+}
+
+/*
+ * Helper function to perform a wrapped pointer add without triggering UBSAN.
+ *
+ * @return ptr + add with wrapping
+ */
+MEM_STATIC
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add)
+{
+    return ptr + add;
+}
+
+/*
+ * Helper function to perform a wrapped pointer subtraction without triggering
+ * UBSAN.
+ *
+ * @return ptr - sub with wrapping
+ */
+MEM_STATIC
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub)
+{
+    return ptr - sub;
+}
+
+/*
+ * Helper function to add to a pointer that works around C's undefined behavior
+ * of adding 0 to NULL.
+ *
+ * @returns `ptr + add` except it defines `NULL + 0 == NULL`.
+ */
+MEM_STATIC
+unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add)
+{
+    return add > 0 ? ptr + add : ptr;
+}
+
+/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an
+ * abundance of caution, disable our custom poisoning on mingw. */
+#ifdef __MINGW32__
+#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE
+#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1
+#endif
+#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE
+#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1
+#endif
+#endif
+
 
 
 #endif /* ZSTD_COMPILER_H */
diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h
index 0db7b42407ee..d8319a2bef4c 100644
--- a/lib/zstd/common/cpu.h
+++ b/lib/zstd/common/cpu.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c
index bb863c9ea616..8eb6aa9a3b20 100644
--- a/lib/zstd/common/debug.c
+++ b/lib/zstd/common/debug.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * debug
  * Part of FSE library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -21,4 +22,10 @@
 
 #include "debug.h"
 
+#if (DEBUGLEVEL>=2)
+/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a
+ * translation unit is empty. So remove this from Linux kernel builds, but
+ * otherwise just leave it in.
+ */
 int g_debuglevel = DEBUGLEVEL;
+#endif
diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h
index 6dd88d1fbd02..226ba3c57ec3 100644
--- a/lib/zstd/common/debug.h
+++ b/lib/zstd/common/debug.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * debug
  * Part of FSE library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -82,18 +83,27 @@ extern int g_debuglevel; /* the variable is only declared,
                             It's useful when enabling very verbose levels
                             on selective conditions (such as position in src) */
 
-#  define RAWLOG(l, ...) {                                       \
-                if (l<=g_debuglevel) {                           \
-                    ZSTD_DEBUG_PRINT(__VA_ARGS__);               \
-            }   }
-#  define DEBUGLOG(l, ...) {                                     \
-                if (l<=g_debuglevel) {                           \
-                    ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \
-                    ZSTD_DEBUG_PRINT(" \n");                     \
-            }   }
+#  define RAWLOG(l, ...)                   \
+    do {                                   \
+        if (l<=g_debuglevel) {             \
+            ZSTD_DEBUG_PRINT(__VA_ARGS__); \
+        }                                  \
+    } while (0)
+
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+#define LINE_AS_STRING TOSTRING(__LINE__)
+
+#  define DEBUGLOG(l, ...)                               \
+    do {                                                 \
+        if (l<=g_debuglevel) {                           \
+            ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \
+            ZSTD_DEBUG_PRINT(" \n");                     \
+        }                                                \
+    } while (0)
 #else
-#  define RAWLOG(l, ...)      {}    /* disabled */
-#  define DEBUGLOG(l, ...)    {}    /* disabled */
+#  define RAWLOG(l, ...)   do { } while (0)    /* disabled */
+#  define DEBUGLOG(l, ...) do { } while (0)    /* disabled */
 #endif
 
 
diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c
index fef67056f052..6cdd82233fb5 100644
--- a/lib/zstd/common/entropy_common.c
+++ b/lib/zstd/common/entropy_common.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * Common functions of New Generation Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -19,8 +20,8 @@
 #include "error_private.h"       /* ERR_*, ERROR */
 #define FSE_STATIC_LINKING_ONLY  /* FSE_MIN_TABLELOG */
 #include "fse.h"
-#define HUF_STATIC_LINKING_ONLY  /* HUF_TABLELOG_ABSOLUTEMAX */
 #include "huf.h"
+#include "bits.h"                /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */
 
 
 /*===   Version   ===*/
@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
 /*-**************************************************************
 *  FSE NCount encoding-decoding
 ****************************************************************/
-static U32 FSE_ctz(U32 val)
-{
-    assert(val != 0);
-    {
-#   if (__GNUC__ >= 3)   /* GCC Intrinsic */
-        return __builtin_ctz(val);
-#   else   /* Software version */
-        U32 count = 0;
-        while ((val & 1) == 0) {
-            val >>= 1;
-            ++count;
-        }
-        return count;
-#   endif
-    }
-}
-
 FORCE_INLINE_TEMPLATE
 size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
                            const void* headerBuffer, size_t hbSize)
@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
              * repeat.
              * Avoid UB by setting the high bit to 1.
              */
-            int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1;
+            int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
             while (repeats >= 12) {
                 charnum += 3 * 12;
                 if (LIKELY(ip <= iend-7)) {
@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
                     ip = iend - 4;
                 }
                 bitStream = MEM_readLE32(ip) >> bitCount;
-                repeats = FSE_ctz(~bitStream | 0x80000000) >> 1;
+                repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1;
             }
             charnum += 3 * repeats;
             bitStream >>= 2 * repeats;
@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne
                  * know that threshold > 1.
                  */
                 if (remaining <= 1) break;
-                nbBits = BIT_highbit32(remaining) + 1;
+                nbBits = ZSTD_highbit32(remaining) + 1;
                 threshold = 1 << (nbBits - 1);
             }
             if (charnum >= maxSV1) break;
@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
                      const void* src, size_t srcSize)
 {
     U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32];
-    return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0);
+    return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0);
 }
 
 FORCE_INLINE_TEMPLATE size_t
@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats,
     if (weightTotal == 0) return ERROR(corruption_detected);
 
     /* get last non-null symbol weight (implied, total must be 2^n) */
-    {   U32 const tableLog = BIT_highbit32(weightTotal) + 1;
+    {   U32 const tableLog = ZSTD_highbit32(weightTotal) + 1;
         if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
         *tableLogPtr = tableLog;
         /* determine last weight */
         {   U32 const total = 1 << tableLog;
             U32 const rest = total - weightTotal;
-            U32 const verif = 1 << BIT_highbit32(rest);
-            U32 const lastWeight = BIT_highbit32(rest) + 1;
+            U32 const verif = 1 << ZSTD_highbit32(rest);
+            U32 const lastWeight = ZSTD_highbit32(rest) + 1;
             if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
             huffWeight[oSize] = (BYTE)lastWeight;
             rankStats[lastWeight]++;
@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats,
                      U32* nbSymbolsPtr, U32* tableLogPtr,
                      const void* src, size_t srcSize,
                      void* workSpace, size_t wkspSize,
-                     int bmi2)
+                     int flags)
 {
 #if DYNAMIC_BMI2
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
         return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
     }
 #endif
-    (void)bmi2;
+    (void)flags;
     return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize);
 }
diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c
index 6d1135f8c373..a4062d30d170 100644
--- a/lib/zstd/common/error_private.c
+++ b/lib/zstd/common/error_private.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code)
     case PREFIX(version_unsupported): return "Version not supported";
     case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
     case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
-    case PREFIX(corruption_detected): return "Corrupted block detected";
+    case PREFIX(corruption_detected): return "Data corruption detected";
     case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
+    case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification";
     case PREFIX(parameter_unsupported): return "Unsupported parameter";
+    case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters";
     case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
     case PREFIX(init_missing): return "Context should be init first";
     case PREFIX(memory_allocation): return "Allocation error : not enough memory";
@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code)
     case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
     case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
     case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
+    case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected";
     case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
     case PREFIX(dictionary_wrong): return "Dictionary mismatch";
     case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
     case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
     case PREFIX(srcSize_wrong): return "Src size is incorrect";
     case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer";
+    case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full";
+    case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty";
         /* following error codes are not stable and may be removed or changed in a future version */
     case PREFIX(frameIndex_tooLarge): return "Frame index is too large";
     case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
     case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong";
     case PREFIX(srcBuffer_wrong): return "Source buffer is wrong";
+    case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code";
+    case PREFIX(externalSequences_invalid): return "External sequences are not valid";
     case PREFIX(maxCode):
     default: return notErrorCode;
     }
diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h
index ca5101e542fa..0410ca415b54 100644
--- a/lib/zstd/common/error_private.h
+++ b/lib/zstd/common/error_private.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -49,8 +50,13 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
 ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); }
 
 /* check and forward error code */
-#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e
-#define CHECK_F(f)   { CHECK_V_F(_var_err__, f); }
+#define CHECK_V_F(e, f)     \
+    size_t const e = f;     \
+    do {                    \
+        if (ERR_isError(e)) \
+            return e;       \
+    } while (0)
+#define CHECK_F(f)   do { CHECK_V_F(_var_err__, f); } while (0)
 
 
 /*-****************************************
@@ -84,10 +90,12 @@ void _force_has_format_string(const char *format, ...) {
  * We want to force this function invocation to be syntactically correct, but
  * we don't want to force runtime evaluation of its arguments.
  */
-#define _FORCE_HAS_FORMAT_STRING(...) \
-  if (0) { \
-    _force_has_format_string(__VA_ARGS__); \
-  }
+#define _FORCE_HAS_FORMAT_STRING(...)              \
+    do {                                           \
+        if (0) {                                   \
+            _force_has_format_string(__VA_ARGS__); \
+        }                                          \
+    } while (0)
 
 #define ERR_QUOTE(str) #str
 
@@ -98,48 +106,50 @@ void _force_has_format_string(const char *format, ...) {
  * In order to do that (particularly, printing the conditional that failed),
  * this can't just wrap RETURN_ERROR().
  */
-#define RETURN_ERROR_IF(cond, err, ...) \
-  if (cond) { \
-    RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \
-           __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \
-    _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
-    RAWLOG(3, ": " __VA_ARGS__); \
-    RAWLOG(3, "\n"); \
-    return ERROR(err); \
-  }
+#define RETURN_ERROR_IF(cond, err, ...)                                        \
+    do {                                                                       \
+        if (cond) {                                                            \
+            RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s",          \
+                  __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \
+            _FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                             \
+            RAWLOG(3, ": " __VA_ARGS__);                                       \
+            RAWLOG(3, "\n");                                                   \
+            return ERROR(err);                                                 \
+        }                                                                      \
+    } while (0)
 
 /*
  * Unconditionally return the specified error.
  *
  * In debug modes, prints additional information.
  */
-#define RETURN_ERROR(err, ...) \
-  do { \
-    RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
-           __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \
-    _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
-    RAWLOG(3, ": " __VA_ARGS__); \
-    RAWLOG(3, "\n"); \
-    return ERROR(err); \
-  } while(0);
+#define RETURN_ERROR(err, ...)                                               \
+    do {                                                                     \
+        RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
+              __FILE__, __LINE__, ERR_QUOTE(ERROR(err)));                    \
+        _FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                               \
+        RAWLOG(3, ": " __VA_ARGS__);                                         \
+        RAWLOG(3, "\n");                                                     \
+        return ERROR(err);                                                   \
+    } while(0)
 
 /*
  * If the provided expression evaluates to an error code, returns that error code.
  *
  * In debug modes, prints additional information.
  */
-#define FORWARD_IF_ERROR(err, ...) \
-  do { \
-    size_t const err_code = (err); \
-    if (ERR_isError(err_code)) { \
-      RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \
-             __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \
-      _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
-      RAWLOG(3, ": " __VA_ARGS__); \
-      RAWLOG(3, "\n"); \
-      return err_code; \
-    } \
-  } while(0);
+#define FORWARD_IF_ERROR(err, ...)                                                 \
+    do {                                                                           \
+        size_t const err_code = (err);                                             \
+        if (ERR_isError(err_code)) {                                               \
+            RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s",                 \
+                  __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \
+            _FORCE_HAS_FORMAT_STRING(__VA_ARGS__);                                 \
+            RAWLOG(3, ": " __VA_ARGS__);                                           \
+            RAWLOG(3, "\n");                                                       \
+            return err_code;                                                       \
+        }                                                                          \
+    } while(0)
 
 
 #endif /* ERROR_H_MODULE */
diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h
index 4507043b2287..2185a578617d 100644
--- a/lib/zstd/common/fse.h
+++ b/lib/zstd/common/fse.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * FSE : Finite State Entropy codec
  * Public Prototypes declaration
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -50,34 +51,6 @@
 FSE_PUBLIC_API unsigned FSE_versionNumber(void);   /*< library version number; to be used when checking dll version */
 
 
-/*-****************************************
-*  FSE simple functions
-******************************************/
-/*! FSE_compress() :
-    Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
-    'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize).
-    @return : size of compressed data (<= dstCapacity).
-    Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
-                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead.
-                     if FSE_isError(return), compression failed (more details using FSE_getErrorName())
-*/
-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity,
-                             const void* src, size_t srcSize);
-
-/*! FSE_decompress():
-    Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
-    into already allocated destination buffer 'dst', of size 'dstCapacity'.
-    @return : size of regenerated data (<= maxDstSize),
-              or an error code, which can be tested using FSE_isError() .
-
-    ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!!
-    Why ? : making this distinction requires a header.
-    Header management is intentionally delegated to the user layer, which can better manage special cases.
-*/
-FSE_PUBLIC_API size_t FSE_decompress(void* dst,  size_t dstCapacity,
-                               const void* cSrc, size_t cSrcSize);
-
-
 /*-*****************************************
 *  Tool functions
 ******************************************/
@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned    FSE_isError(size_t code);        /* tells if a return
 FSE_PUBLIC_API const char* FSE_getErrorName(size_t code);   /* provides error code string (useful for debugging) */
 
 
-/*-*****************************************
-*  FSE advanced functions
-******************************************/
-/*! FSE_compress2() :
-    Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
-    Both parameters can be defined as '0' to mean : use default value
-    @return : size of compressed data
-    Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!!
-                     if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
-                     if FSE_isError(return), it's an error code.
-*/
-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
-
-
 /*-*****************************************
 *  FSE detailed API
 ******************************************/
@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize,
 /*! Constructor and Destructor of FSE_CTable.
     Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
 typedef unsigned FSE_CTable;   /* don't allocate that. It's only meant to be more restrictive than void* */
-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog);
-FSE_PUBLIC_API void        FSE_freeCTable (FSE_CTable* ct);
 
 /*! FSE_buildCTable():
     Builds `ct`, which must be already allocated, using FSE_createCTable().
@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter,
                            unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
                            const void* rBuffer, size_t rBuffSize, int bmi2);
 
-/*! Constructor and Destructor of FSE_DTable.
-    Note that its size depends on 'tableLog' */
 typedef unsigned FSE_DTable;   /* don't allocate that. It's just a way to be more restrictive than void* */
-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog);
-FSE_PUBLIC_API void        FSE_freeDTable(FSE_DTable* dt);
-
-/*! FSE_buildDTable():
-    Builds 'dt', which must be already allocated, using FSE_createDTable().
-    return : 0, or an errorCode, which can be tested using FSE_isError() */
-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
-
-/*! FSE_decompress_usingDTable():
-    Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
-    into `dst` which must be already allocated.
-    @return : size of regenerated data (necessarily <= `dstCapacity`),
-              or an errorCode, which can be tested using FSE_isError() */
-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
 
 /*!
 Tutorial :
@@ -286,6 +227,7 @@ If there is an error, the function will return an error code, which can be teste
 
 #endif  /* FSE_H */
 
+
 #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY)
 #define FSE_H_FSE_STATIC_LINKING_ONLY
 
@@ -317,16 +259,6 @@ If there is an error, the function will return an error code, which can be teste
 unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
 /*< same as FSE_optimalTableLog(), which used `minus==2` */
 
-/* FSE_compress_wksp() :
- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
- */
-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue)   ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
-
-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
-
 size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
 /*< build a fake FSE_CTable, designed to compress always the same symbolValue */
 
@@ -344,19 +276,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi
 FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
 /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */
 
-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
-
-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */
-
-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1)
+#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1)
 #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned))
-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize);
-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */
-
 size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2);
-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */
+/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`.
+ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */
 
 typedef enum {
    FSE_repeat_none,  /*< Cannot use the previous table */
@@ -539,20 +463,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un
     FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
     const U16* const stateTable = (const U16*)(statePtr->stateTable);
     U32 const nbBitsOut  = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
-    BIT_addBits(bitC, statePtr->value, nbBitsOut);
+    BIT_addBits(bitC,  (size_t)statePtr->value, nbBitsOut);
     statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
 }
 
 MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
 {
-    BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+    BIT_addBits(bitC, (size_t)statePtr->value, statePtr->stateLog);
     BIT_flushBits(bitC);
 }
 
 
 /* FSE_getMaxNbBits() :
  * Approximate maximum cost of a symbol, in bits.
- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
+ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
  * note 1 : assume symbolValue is valid (<= maxSymbolValue)
  * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
 MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue)
diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c
index 8dcb8ca39767..3a17e84f27bf 100644
--- a/lib/zstd/common/fse_decompress.c
+++ b/lib/zstd/common/fse_decompress.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * FSE : Finite State Entropy decoder
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -22,8 +23,8 @@
 #define FSE_STATIC_LINKING_ONLY
 #include "fse.h"
 #include "error_private.h"
-#define ZSTD_DEPS_NEED_MALLOC
-#include "zstd_deps.h"
+#include "zstd_deps.h"  /* ZSTD_memcpy */
+#include "bits.h"       /* ZSTD_highbit32 */
 
 
 /* **************************************************************
@@ -55,19 +56,6 @@
 #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
 #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
 
-
-/* Function templates */
-FSE_DTable* FSE_createDTable (unsigned tableLog)
-{
-    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
-    return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
-}
-
-void FSE_freeDTable (FSE_DTable* dt)
-{
-    ZSTD_free(dt);
-}
-
 static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
 {
     void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
@@ -96,7 +84,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
                     symbolNext[s] = 1;
                 } else {
                     if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
-                    symbolNext[s] = normalizedCounter[s];
+                    symbolNext[s] = (U16)normalizedCounter[s];
         }   }   }
         ZSTD_memcpy(dt, &DTableH, sizeof(DTableH));
     }
@@ -111,8 +99,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
          * all symbols have counts <= 8. We ensure we have 8 bytes at the end of
          * our buffer to handle the over-write.
          */
-        {
-            U64 const add = 0x0101010101010101ull;
+        {   U64 const add = 0x0101010101010101ull;
             size_t pos = 0;
             U64 sv = 0;
             U32 s;
@@ -123,14 +110,13 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
                 for (i = 8; i < n; i += 8) {
                     MEM_write64(spread + pos + i, sv);
                 }
-                pos += n;
-            }
-        }
+                pos += (size_t)n;
+        }   }
         /* Now we spread those positions across the table.
-         * The benefit of doing it in two stages is that we avoid the the
+         * The benefit of doing it in two stages is that we avoid the
          * variable size inner loop, which caused lots of branch misses.
          * Now we can run through all the positions without any branch misses.
-         * We unroll the loop twice, since that is what emperically worked best.
+         * We unroll the loop twice, since that is what empirically worked best.
          */
         {
             size_t position = 0;
@@ -166,7 +152,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo
         for (u=0; u<tableSize; u++) {
             FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
             U32 const nextState = symbolNext[symbol]++;
-            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
             tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
     }   }
 
@@ -184,49 +170,6 @@ size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsi
 /*-*******************************************************
 *  Decompression (Byte symbols)
 *********************************************************/
-size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
-{
-    void* ptr = dt;
-    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
-    void* dPtr = dt + 1;
-    FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
-
-    DTableH->tableLog = 0;
-    DTableH->fastMode = 0;
-
-    cell->newState = 0;
-    cell->symbol = symbolValue;
-    cell->nbBits = 0;
-
-    return 0;
-}
-
-
-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
-{
-    void* ptr = dt;
-    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
-    void* dPtr = dt + 1;
-    FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
-    const unsigned tableSize = 1 << nbBits;
-    const unsigned tableMask = tableSize - 1;
-    const unsigned maxSV1 = tableMask+1;
-    unsigned s;
-
-    /* Sanity checks */
-    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
-
-    /* Build Decoding Table */
-    DTableH->tableLog = (U16)nbBits;
-    DTableH->fastMode = 1;
-    for (s=0; s<maxSV1; s++) {
-        dinfo[s].newState = 0;
-        dinfo[s].symbol = (BYTE)s;
-        dinfo[s].nbBits = (BYTE)nbBits;
-    }
-
-    return 0;
-}
 
 FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
           void* dst, size_t maxDstSize,
@@ -287,32 +230,12 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
             break;
     }   }
 
-    return op-ostart;
-}
-
-
-size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
-                            const void* cSrc, size_t cSrcSize,
-                            const FSE_DTable* dt)
-{
-    const void* ptr = dt;
-    const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
-    const U32 fastMode = DTableH->fastMode;
-
-    /* select fast mode (static) */
-    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
-    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
-}
-
-
-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize)
-{
-    return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0);
+    assert(op >= ostart);
+    return (size_t)(op-ostart);
 }
 
 typedef struct {
     short ncount[FSE_MAX_SYMBOL_VALUE + 1];
-    FSE_DTable dtable[]; /* Dynamically sized */
 } FSE_DecompressWksp;
 
 
@@ -327,13 +250,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
     unsigned tableLog;
     unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
     FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace;
+    size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable);
+    FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos;
 
-    DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0);
+    FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0);
     if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC);
 
+    /* correct offset to dtable depends on this property */
+    FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0);
+
     /* normal FSE decoding mode */
-    {
-        size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2);
+    {   size_t const NCountLength =
+            FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2);
         if (FSE_isError(NCountLength)) return NCountLength;
         if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
         assert(NCountLength <= cSrcSize);
@@ -342,19 +270,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body(
     }
 
     if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge);
-    workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog);
+    assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize);
+    workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
     wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog);
 
-    CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );
+    CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) );
 
     {
-        const void* ptr = wksp->dtable;
+        const void* ptr = dtable;
         const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
         const U32 fastMode = DTableH->fastMode;
 
         /* select fast mode (static) */
-        if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1);
-        return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0);
+        if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1);
+        return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0);
     }
 }
 
@@ -382,9 +311,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc,
     return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize);
 }
 
-
-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
-
-
-
 #endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h
index 5042ff870308..57462466e188 100644
--- a/lib/zstd/common/huf.h
+++ b/lib/zstd/common/huf.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * huff0 huffman codec,
  * part of Finite State Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * You can contact the author at :
  * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -18,99 +19,22 @@
 
 /* *** Dependencies *** */
 #include "zstd_deps.h"    /* size_t */
-
-
-/* *** library symbols visibility *** */
-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual,
- *        HUF symbols remain "private" (internal symbols for library only).
- *        Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */
-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
-#  define HUF_PUBLIC_API __attribute__ ((visibility ("default")))
-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1)   /* Visual expected */
-#  define HUF_PUBLIC_API __declspec(dllexport)
-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
-#  define HUF_PUBLIC_API __declspec(dllimport)  /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */
-#else
-#  define HUF_PUBLIC_API
-#endif
-
-
-/* ========================== */
-/* ***  simple functions  *** */
-/* ========================== */
-
-/* HUF_compress() :
- *  Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'.
- * 'dst' buffer must be already allocated.
- *  Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize).
- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB.
- * @return : size of compressed data (<= `dstCapacity`).
- *  Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
- *                   if HUF_isError(return), compression failed (more details using HUF_getErrorName())
- */
-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity,
-                             const void* src, size_t srcSize);
-
-/* HUF_decompress() :
- *  Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
- *  into already allocated buffer 'dst', of minimum size 'dstSize'.
- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data.
- *  Note : in contrast with FSE, HUF_decompress can regenerate
- *         RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
- *         because it knows size to regenerate (originalSize).
- * @return : size of regenerated data (== originalSize),
- *           or an error code, which can be tested using HUF_isError()
- */
-HUF_PUBLIC_API size_t HUF_decompress(void* dst,  size_t originalSize,
-                               const void* cSrc, size_t cSrcSize);
+#include "mem.h"          /* U32 */
+#define FSE_STATIC_LINKING_ONLY
+#include "fse.h"
 
 
 /* ***   Tool functions *** */
-#define HUF_BLOCKSIZE_MAX (128 * 1024)                  /*< maximum input size for a single block compressed with HUF_compress */
-HUF_PUBLIC_API size_t HUF_compressBound(size_t size);   /*< maximum compressed size (worst case) */
+#define HUF_BLOCKSIZE_MAX (128 * 1024)   /*< maximum input size for a single block compressed with HUF_compress */
+size_t HUF_compressBound(size_t size);   /*< maximum compressed size (worst case) */
 
 /* Error Management */
-HUF_PUBLIC_API unsigned    HUF_isError(size_t code);       /*< tells if a return value is an error code */
-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code);  /*< provides error code string (useful for debugging) */
-
+unsigned    HUF_isError(size_t code);       /*< tells if a return value is an error code */
+const char* HUF_getErrorName(size_t code);  /*< provides error code string (useful for debugging) */
 
-/* ***   Advanced function   *** */
 
-/* HUF_compress2() :
- *  Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`.
- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX .
- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */
-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,
-                               const void* src, size_t srcSize,
-                               unsigned maxSymbolValue, unsigned tableLog);
-
-/* HUF_compress4X_wksp() :
- *  Same as HUF_compress2(), but uses externally allocated `workSpace`.
- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */
 #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */)
 #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64))
-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
-                                     const void* src, size_t srcSize,
-                                     unsigned maxSymbolValue, unsigned tableLog,
-                                     void* workSpace, size_t wkspSize);
-
-#endif   /* HUF_H_298734234 */
-
-/* ******************************************************************
- *  WARNING !!
- *  The following section contains advanced and experimental definitions
- *  which shall never be used in the context of a dynamic library,
- *  because they are not guaranteed to remain stable in the future.
- *  Only consider them in association with static linking.
- * *****************************************************************/
-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY)
-#define HUF_H_HUF_STATIC_LINKING_ONLY
-
-/* *** Dependencies *** */
-#include "mem.h"   /* U32 */
-#define FSE_STATIC_LINKING_ONLY
-#include "fse.h"
-
 
 /* *** Constants *** */
 #define HUF_TABLELOG_MAX      12      /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
@@ -151,25 +75,49 @@ typedef U32 HUF_DTable;
 /* ****************************************
 *  Advanced decompression functions
 ******************************************/
-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< single-symbol decoder */
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< double-symbols decoder */
-#endif
 
-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< decodes RLE and uncompressed */
-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */
-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */
-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< single-symbol decoder */
-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< single-symbol decoder */
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< double-symbols decoder */
-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< double-symbols decoder */
-#endif
+/*
+ * Huffman flags bitset.
+ * For all flags, 0 is the default value.
+ */
+typedef enum {
+    /*
+     * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime.
+     * Otherwise: Ignored.
+     */
+    HUF_flags_bmi2 = (1 << 0),
+    /*
+     * If set: Test possible table depths to find the one that produces the smallest header + encoded size.
+     * If unset: Use heuristic to find the table depth.
+     */
+    HUF_flags_optimalDepth = (1 << 1),
+    /*
+     * If set: If the previous table can encode the input, always reuse the previous table.
+     * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output.
+     */
+    HUF_flags_preferRepeat = (1 << 2),
+    /*
+     * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress.
+     * If unset: Always histogram the entire input.
+     */
+    HUF_flags_suspectUncompressible = (1 << 3),
+    /*
+     * If set: Don't use assembly implementations
+     * If unset: Allow using assembly implementations
+     */
+    HUF_flags_disableAsm = (1 << 4),
+    /*
+     * If set: Don't use the fast decoding loop, always use the fallback decoding loop.
+     * If unset: Use the fast decoding loop when possible.
+     */
+    HUF_flags_disableFast = (1 << 5)
+} HUF_flags_e;
 
 
 /* ****************************************
  *  HUF detailed API
  * ****************************************/
+#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra
 
 /*! HUF_compress() does the following:
  *  1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h")
@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
  *  For example, it's possible to compress several blocks using the same 'CTable',
  *  or to save and regenerate 'CTable' using external methods.
  */
-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits);   /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */
-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
+unsigned HUF_minTableLog(unsigned symbolCardinality);
+unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue);
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace,
+ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */
 size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize);
-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2);
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
 size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
 int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
 
@@ -196,6 +144,7 @@ typedef enum {
    HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
    HUF_repeat_valid  /*< Can use the previous table and it is assumed to be valid */
  } HUF_repeat;
+
 /* HUF_compress4X_repeat() :
  *  Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
  *  If it uses hufTable it does not modify hufTable or repeat.
@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
                        const void* src, size_t srcSize,
                        unsigned maxSymbolValue, unsigned tableLog,
                        void* workSpace, size_t wkspSize,    /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
-                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
 
 /* HUF_buildCTable_wksp() :
  *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
  * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
  */
-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1)
+#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192)
 #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
 size_t HUF_buildCTable_wksp (HUF_CElt* tree,
                        const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize,
                           U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
                           const void* src, size_t srcSize,
                           void* workspace, size_t wkspSize,
-                          int bmi2);
+                          int flags);
 
 /* HUF_readCTable() :
  *  Loading a CTable saved with HUF_writeCTable() */
@@ -246,9 +195,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
 
 /* HUF_getNbBitsFromCTable() :
  *  Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX
- *  Note 1 : is not inlined, as HUF_CElt definition is private */
+ *  Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0
+ *  Note 2 : is not inlined, as HUF_CElt definition is private
+ */
 U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue);
 
+typedef struct {
+    BYTE tableLog;
+    BYTE maxSymbolValue;
+    BYTE unused[sizeof(size_t) - 2];
+} HUF_CTableHeader;
+
+/* HUF_readCTableHeader() :
+ *  @returns The header from the CTable specifying the tableLog and the maxSymbolValue.
+ */
+HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable);
+
 /*
  * HUF_decompress() does the following:
  * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics
@@ -276,32 +238,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
 #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9))
 #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
 
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize);
-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
-#endif
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
-#endif
-
-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#endif
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#endif
-
 
 /* ====================== */
 /* single stream variants */
 /* ====================== */
 
-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);  /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */
-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2);
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags);
 /* HUF_compress1X_repeat() :
  *  Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
  *  If it uses hufTable it does not modify hufTable or repeat.
@@ -312,47 +254,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
                        const void* src, size_t srcSize,
                        unsigned maxSymbolValue, unsigned tableLog,
                        void* workSpace, size_t wkspSize,   /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
-                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible);
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int flags);
 
-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
-#endif
-
-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< single-symbol decoder */
-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< single-symbol decoder */
-#endif
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
 #ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /*< double-symbols decoder */
-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /*< double-symbols decoder */
-#endif
-
-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /*< automatic selection of sing or double symbol decoder, based on DTable */
-#ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
-#endif
-#ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);   /*< double-symbols decoder */
 #endif
 
 /* BMI2 variants.
  * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
  */
-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
 #ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
 #endif
-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags);
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags);
 #ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
 #endif
 #ifndef HUF_FORCE_DECOMPRESS_X1
-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2);
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags);
 #endif
 
-#endif /* HUF_STATIC_LINKING_ONLY */
+#endif   /* HUF_H_298734234 */
 
diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h
index 1d9cc03924ca..2e91e7780c1f 100644
--- a/lib/zstd/common/mem.h
+++ b/lib/zstd/common/mem.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -24,6 +24,7 @@
 /*-****************************************
 *  Compiler specifics
 ******************************************/
+#undef MEM_STATIC /* may be already defined from common/compiler.h */
 #define MEM_STATIC static inline
 
 /*-**************************************************************
diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h
index 0e3b2c0a527d..f08638cced6c 100644
--- a/lib/zstd/common/portability_macros.h
+++ b/lib/zstd/common/portability_macros.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -12,7 +13,7 @@
 #define ZSTD_PORTABILITY_MACROS_H
 
 /*
- * This header file contains macro defintions to support portability.
+ * This header file contains macro definitions to support portability.
  * This header is shared between C and ASM code, so it MUST only
  * contain macro definitions. It MUST not contain any C code.
  *
@@ -45,6 +46,8 @@
 /* Mark the internal assembly functions as hidden  */
 #ifdef __ELF__
 # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func
+#elif defined(__APPLE__)
+# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func
 #else
 # define ZSTD_HIDE_ASM_FUNCTION(func)
 #endif
@@ -65,7 +68,7 @@
 #endif
 
 /*
- * Only enable assembly for GNUC comptabile compilers,
+ * Only enable assembly for GNUC compatible compilers,
  * because other platforms may not support GAS assembly syntax.
  *
  * Only enable assembly for Linux / MacOS, other platforms may
@@ -90,4 +93,23 @@
  */
 #define ZSTD_ENABLE_ASM_X86_64_BMI2 0
 
+/*
+ * For x86 ELF targets, add .note.gnu.property section for Intel CET in
+ * assembly sources when CET is enabled.
+ *
+ * Additionally, any function that may be called indirectly must begin
+ * with ZSTD_CET_ENDBRANCH.
+ */
+#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \
+    && defined(__has_include)
+# if __has_include(<cet.h>)
+#  include <cet.h>
+#  define ZSTD_CET_ENDBRANCH _CET_ENDBR
+# endif
+#endif
+
+#ifndef ZSTD_CET_ENDBRANCH
+# define ZSTD_CET_ENDBRANCH
+#endif
+
 #endif /* ZSTD_PORTABILITY_MACROS_H */
diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c
index 3d7e35b309b5..44b95b25344a 100644
--- a/lib/zstd/common/zstd_common.c
+++ b/lib/zstd/common/zstd_common.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,7 +15,6 @@
 *  Dependencies
 ***************************************/
 #define ZSTD_DEPS_NEED_MALLOC
-#include "zstd_deps.h"   /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
 #include "error_private.h"
 #include "zstd_internal.h"
 
@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
 /*! ZSTD_getErrorString() :
  *  provides error code string from enum */
 const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
-
-
-
-/*=**************************************************************
-*  Custom allocator
-****************************************************************/
-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
-{
-    if (customMem.customAlloc)
-        return customMem.customAlloc(customMem.opaque, size);
-    return ZSTD_malloc(size);
-}
-
-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
-{
-    if (customMem.customAlloc) {
-        /* calloc implemented as malloc+memset;
-         * not as efficient as calloc, but next best guess for custom malloc */
-        void* const ptr = customMem.customAlloc(customMem.opaque, size);
-        ZSTD_memset(ptr, 0, size);
-        return ptr;
-    }
-    return ZSTD_calloc(1, size);
-}
-
-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
-{
-    if (ptr!=NULL) {
-        if (customMem.customFree)
-            customMem.customFree(customMem.opaque, ptr);
-        else
-            ZSTD_free(ptr);
-    }
-}
diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h
index 2c34e8a33a1c..f931f7d0e294 100644
--- a/lib/zstd/common/zstd_deps.h
+++ b/lib/zstd/common/zstd_deps.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) {
 
 #endif /* ZSTD_DEPS_IO */
 #endif /* ZSTD_DEPS_NEED_IO */
+
+/*
+ * Only requested when MSAN is enabled.
+ * Need:
+ * intptr_t
+ */
+#ifdef ZSTD_DEPS_NEED_STDINT
+#ifndef ZSTD_DEPS_STDINT
+#define ZSTD_DEPS_STDINT
+
+/* intptr_t already provided by ZSTD_DEPS_COMMON */
+
+#endif /* ZSTD_DEPS_STDINT */
+#endif /* ZSTD_DEPS_NEED_STDINT */
diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h
index 93305d9b41bb..11da1233e890 100644
--- a/lib/zstd/common/zstd_internal.h
+++ b/lib/zstd/common/zstd_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -28,7 +29,6 @@
 #include <linux/zstd.h>
 #define FSE_STATIC_LINKING_ONLY
 #include "fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "huf.h"
 #include <linux/xxhash.h>                /* XXH_reset, update, digest */
 #define ZSTD_TRACE 0
@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
 #define ZSTD_FRAMECHECKSUMSIZE 4
 
 #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */)   /* for a non-null block */
+#define MIN_LITERALS_FOR_4_STREAMS 6
 
-#define HufLog 12
 typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
 
 #define LONGNBSEQ 0x7F00
@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
 #define MINMATCH 3
 
 #define Litbits  8
+#define LitHufLog 11
 #define MaxLit ((1<<Litbits) - 1)
 #define MaxML   52
 #define MaxLL   35
@@ -103,6 +104,8 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
 #define LLFSELog    9
 #define OffFSELog   8
 #define MaxFSELog  MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
+#define MaxMLBits 16
+#define MaxLLBits 16
 
 #define ZSTD_MAX_HUF_HEADER_SIZE 128 /* header + <= 127 byte tree description */
 /* Each table cannot take more than #symbols * FSELog bits */
@@ -166,7 +169,7 @@ static void ZSTD_copy8(void* dst, const void* src) {
     ZSTD_memcpy(dst, src, 8);
 #endif
 }
-#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+#define COPY8(d,s) do { ZSTD_copy8(d,s); d+=8; s+=8; } while (0)
 
 /* Need to use memmove here since the literal buffer can now be located within
    the dst buffer. In circumstances where the op "catches up" to where the
@@ -186,7 +189,7 @@ static void ZSTD_copy16(void* dst, const void* src) {
     ZSTD_memcpy(dst, copy16_buf, 16);
 #endif
 }
-#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
+#define COPY16(d,s) do { ZSTD_copy16(d,s); d+=16; s+=16; } while (0)
 
 #define WILDCOPY_OVERLENGTH 32
 #define WILDCOPY_VECLEN 16
@@ -215,7 +218,7 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
     if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) {
         /* Handle short offset copies. */
         do {
-            COPY8(op, ip)
+            COPY8(op, ip);
         } while (op < oend);
     } else {
         assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN);
@@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
          * one COPY16() in the first call. Then, do two calls per loop since
          * at that point it is more likely to have a high trip count.
          */
-#ifdef __aarch64__
-        do {
-            COPY16(op, ip);
-        }
-        while (op < oend);
-#else
         ZSTD_copy16(op, ip);
         if (16 >= length) return;
         op += 16;
@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
             COPY16(op, ip);
         }
         while (op < oend);
-#endif
     }
 }
 
@@ -289,11 +285,11 @@ typedef enum {
 typedef struct {
     seqDef* sequencesStart;
     seqDef* sequences;      /* ptr to end of sequences */
-    BYTE* litStart;
-    BYTE* lit;              /* ptr to end of literals */
-    BYTE* llCode;
-    BYTE* mlCode;
-    BYTE* ofCode;
+    BYTE*  litStart;
+    BYTE*  lit;             /* ptr to end of literals */
+    BYTE*  llCode;
+    BYTE*  mlCode;
+    BYTE*  ofCode;
     size_t maxNbSeq;
     size_t maxNbLit;
 
@@ -301,8 +297,8 @@ typedef struct {
      * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment
      * the existing value of the litLength or matchLength by 0x10000.
      */
-    ZSTD_longLengthType_e   longLengthType;
-    U32                     longLengthPos;  /* Index of the sequence to apply long length modification to */
+    ZSTD_longLengthType_e longLengthType;
+    U32                   longLengthPos;  /* Index of the sequence to apply long length modification to */
 } seqStore_t;
 
 typedef struct {
@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore
     seqLen.matchLength = seq->mlBase + MINMATCH;
     if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) {
         if (seqStore->longLengthType == ZSTD_llt_literalLength) {
-            seqLen.litLength += 0xFFFF;
+            seqLen.litLength += 0x10000;
         }
         if (seqStore->longLengthType == ZSTD_llt_matchLength) {
-            seqLen.matchLength += 0xFFFF;
+            seqLen.matchLength += 0x10000;
         }
     }
     return seqLen;
@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore
  *          `decompressedBound != ZSTD_CONTENTSIZE_ERROR`
  */
 typedef struct {
+    size_t nbBlocks;
     size_t compressedSize;
     unsigned long long decompressedBound;
 } ZSTD_frameSizeInfo;   /* decompress & legacy */
 
 const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx);   /* compress & dictBuilder */
-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
-
-/* custom memory allocation functions */
-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem);
-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem);
-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem);
-
-
-MEM_STATIC U32 ZSTD_highbit32(U32 val)   /* compress, dictBuilder, decodeCorpus */
-{
-    assert(val != 0);
-    {
-#   if (__GNUC__ >= 3)   /* GCC Intrinsic */
-        return __builtin_clz (val) ^ 31;
-#   else   /* Software version */
-        static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
-        U32 v = val;
-        v |= v >> 1;
-        v |= v >> 2;
-        v |= v >> 4;
-        v |= v >> 8;
-        v |= v >> 16;
-        return DeBruijnClz[(v * 0x07C4ACDDU) >> 27];
-#   endif
-    }
-}
-
-/*
- * Counts the number of trailing zeros of a `size_t`.
- * Most compilers should support CTZ as a builtin. A backup
- * implementation is provided if the builtin isn't supported, but
- * it may not be terribly efficient.
- */
-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val)
-{
-    if (MEM_64bits()) {
-#       if (__GNUC__ >= 4)
-            return __builtin_ctzll((U64)val);
-#       else
-            static const int DeBruijnBytePos[64] = {  0,  1,  2,  7,  3, 13,  8, 19,
-                                                      4, 25, 14, 28,  9, 34, 20, 56,
-                                                      5, 17, 26, 54, 15, 41, 29, 43,
-                                                      10, 31, 38, 35, 21, 45, 49, 57,
-                                                      63,  6, 12, 18, 24, 27, 33, 55,
-                                                      16, 53, 40, 42, 30, 37, 44, 48,
-                                                      62, 11, 23, 32, 52, 39, 36, 47,
-                                                      61, 22, 51, 46, 60, 50, 59, 58 };
-            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
-#       endif
-    } else { /* 32 bits */
-#       if (__GNUC__ >= 3)
-            return __builtin_ctz((U32)val);
-#       else
-            static const int DeBruijnBytePos[32] = {  0,  1, 28,  2, 29, 14, 24,  3,
-                                                     30, 22, 20, 15, 25, 17,  4,  8,
-                                                     31, 27, 13, 23, 21, 19, 16,  7,
-                                                     26, 12, 18,  6, 11,  5, 10,  9 };
-            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-#       endif
-    }
-}
+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr);   /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
 
 
 /* ZSTD_invalidateRepCodes() :
@@ -420,13 +357,13 @@ typedef struct {
 
 /*! ZSTD_getcBlockSize() :
  *  Provides the size of compressed block from block header `src` */
-/* Used by: decompress, fullbench (does not get its definition from here) */
+/*  Used by: decompress, fullbench */
 size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
                           blockProperties_t* bpPtr);
 
 /*! ZSTD_decodeSeqHeaders() :
  *  decode sequence header from src */
-/* Used by: decompress, fullbench (does not get its definition from here) */
+/*  Used by: zstd_decompress_block, fullbench */
 size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
                        const void* src, size_t srcSize);
 
diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h
index d9a76112ec3a..6ab8be6532ef 100644
--- a/lib/zstd/compress/clevels.h
+++ b/lib/zstd/compress/clevels.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c
index ec5b1ca6d71a..44a3c10becf2 100644
--- a/lib/zstd/compress/fse_compress.c
+++ b/lib/zstd/compress/fse_compress.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * FSE : Finite State Entropy encoder
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -25,7 +26,8 @@
 #include "../common/error_private.h"
 #define ZSTD_DEPS_NEED_MALLOC
 #define ZSTD_DEPS_NEED_MATH64
-#include "../common/zstd_deps.h"  /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */
+#include "../common/zstd_deps.h"  /* ZSTD_memset */
+#include "../common/bits.h" /* ZSTD_highbit32 */
 
 
 /* **************************************************************
@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
     assert(tableLog < 16);   /* required for threshold strategy to work */
 
     /* For explanations on how to distribute symbol values over the table :
-     * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+     * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
 
      #ifdef __clang_analyzer__
      ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize);   /* useless initialization, just to keep scan-build happy */
@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct,
                 break;
             default :
                 assert(normalizedCounter[s] > 1);
-                {   U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1);
+                {   U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1);
                     U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut;
                     symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
                     symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]);
@@ -224,8 +226,8 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
     size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog
                                    + 4 /* bitCount initialized at 4 */
                                    + 2 /* first two symbols may use one additional bit each */) / 8)
-                                    + 1 /* round up to whole nb bytes */
-                                    + 2 /* additional two bytes for bitstream flush */;
+                                   + 1 /* round up to whole nb bytes */
+                                   + 2 /* additional two bytes for bitstream flush */;
     return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ? use default */
 }
 
@@ -254,7 +256,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize,
     /* Init */
     remaining = tableSize+1;   /* +1 for extra accuracy */
     threshold = tableSize;
-    nbBits = tableLog+1;
+    nbBits = (int)tableLog+1;
 
     while ((symbol < alphabetSize) && (remaining>1)) {  /* stops at 1 */
         if (previousIs0) {
@@ -273,7 +275,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize,
             }
             while (symbol >= start+3) {
                 start+=3;
-                bitStream += 3 << bitCount;
+                bitStream += 3U << bitCount;
                 bitCount += 2;
             }
             bitStream += (symbol-start) << bitCount;
@@ -293,7 +295,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize,
             count++;   /* +1 for extra accuracy */
             if (count>=threshold)
                 count += max;   /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
-            bitStream += count << bitCount;
+            bitStream += (U32)count << bitCount;
             bitCount  += nbBits;
             bitCount  -= (count<max);
             previousIs0  = (count==1);
@@ -321,7 +323,8 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize,
     out[1] = (BYTE)(bitStream>>8);
     out+= (bitCount+7) /8;
 
-    return (out-ostart);
+    assert(out >= ostart);
+    return (size_t)(out-ostart);
 }
 
 
@@ -342,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize,
 *  FSE Compression Code
 ****************************************************************/
 
-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
-{
-    size_t size;
-    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
-    size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
-    return (FSE_CTable*)ZSTD_malloc(size);
-}
-
-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); }
-
 /* provides the minimum logSize to safely represent a distribution */
 static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
 {
-    U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1;
-    U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
+    U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1;
+    U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2;
     U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
     assert(srcSize > 1); /* Not supported, RLE should be used instead */
     return minBits;
@@ -364,7 +357,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
 
 unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
 {
-    U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
+    U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus;
     U32 tableLog = maxTableLog;
     U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
     assert(srcSize > 1); /* Not supported, RLE should be used instead */
@@ -532,40 +525,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
     return tableLog;
 }
 
-
-/* fake FSE_CTable, for raw (uncompressed) input */
-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
-{
-    const unsigned tableSize = 1 << nbBits;
-    const unsigned tableMask = tableSize - 1;
-    const unsigned maxSymbolValue = tableMask;
-    void* const ptr = ct;
-    U16* const tableU16 = ( (U16*) ptr) + 2;
-    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1);   /* assumption : tableLog >= 1 */
-    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
-    unsigned s;
-
-    /* Sanity checks */
-    if (nbBits < 1) return ERROR(GENERIC);             /* min size */
-
-    /* header */
-    tableU16[-2] = (U16) nbBits;
-    tableU16[-1] = (U16) maxSymbolValue;
-
-    /* Build table */
-    for (s=0; s<tableSize; s++)
-        tableU16[s] = (U16)(tableSize + s);
-
-    /* Build Symbol Transformation Table */
-    {   const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
-        for (s=0; s<=maxSymbolValue; s++) {
-            symbolTT[s].deltaNbBits = deltaNbBits;
-            symbolTT[s].deltaFindState = s-1;
-    }   }
-
-    return 0;
-}
-
 /* fake FSE_CTable, for rle input (always same symbol) */
 size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
 {
@@ -664,5 +623,4 @@ size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
 
 size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
 
-
 #endif   /* FSE_COMMONDEFS_ONLY */
diff --git a/lib/zstd/compress/hist.c b/lib/zstd/compress/hist.c
index 3ddc6dfb6894..0b12587cc14b 100644
--- a/lib/zstd/compress/hist.c
+++ b/lib/zstd/compress/hist.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * hist : Histogram functions
  * part of Finite State Entropy project
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
diff --git a/lib/zstd/compress/hist.h b/lib/zstd/compress/hist.h
index fc1830abc9c6..f7687b0fc20a 100644
--- a/lib/zstd/compress/hist.h
+++ b/lib/zstd/compress/hist.h
@@ -1,7 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /* ******************************************************************
  * hist : Histogram functions
  * part of Finite State Entropy project
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compress.c
index 74ef0db47621..0b229f5d2ae2 100644
--- a/lib/zstd/compress/huf_compress.c
+++ b/lib/zstd/compress/huf_compress.c
@@ -1,6 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * Huffman encoder, part of New Generation Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -26,9 +27,9 @@
 #include "hist.h"
 #define FSE_STATIC_LINKING_ONLY   /* FSE_optimalTableLog_internal */
 #include "../common/fse.h"        /* header compression */
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "../common/error_private.h"
+#include "../common/bits.h"       /* ZSTD_highbit32 */
 
 
 /* **************************************************************
@@ -39,13 +40,67 @@
 
 
 /* **************************************************************
-*  Utils
+*  Required declarations
 ****************************************************************/
-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+typedef struct nodeElt_s {
+    U32 count;
+    U16 parent;
+    BYTE byte;
+    BYTE nbBits;
+} nodeElt;
+
+
+/* **************************************************************
+*  Debug Traces
+****************************************************************/
+
+#if DEBUGLEVEL >= 2
+
+static size_t showU32(const U32* arr, size_t size)
 {
-    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", arr[u]); (void)arr;
+    }
+    RAWLOG(6, " \n");
+    return size;
 }
 
+static size_t HUF_getNbBits(HUF_CElt elt);
+
+static size_t showCTableBits(const HUF_CElt* ctable, size_t size)
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %zu", HUF_getNbBits(ctable[u])); (void)ctable;
+    }
+    RAWLOG(6, " \n");
+    return size;
+
+}
+
+static size_t showHNodeSymbols(const nodeElt* hnode, size_t size)
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", hnode[u].byte); (void)hnode;
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+static size_t showHNodeBits(const nodeElt* hnode, size_t size)
+{
+    size_t u;
+    for (u=0; u<size; u++) {
+        RAWLOG(6, " %u", hnode[u].nbBits); (void)hnode;
+    }
+    RAWLOG(6, " \n");
+    return size;
+}
+
+#endif
+
 
 /* *******************************************************
 *  HUF : Huffman block compression
@@ -86,7 +141,10 @@ typedef struct {
     S16 norm[HUF_TABLELOG_MAX+1];
 } HUF_CompressWeightsWksp;
 
-static size_t HUF_compressWeights(void* dst, size_t dstSize, const void* weightTable, size_t wtSize, void* workspace, size_t workspaceSize)
+static size_t
+HUF_compressWeights(void* dst, size_t dstSize,
+              const void* weightTable, size_t wtSize,
+                    void* workspace, size_t workspaceSize)
 {
     BYTE* const ostart = (BYTE*) dst;
     BYTE* op = ostart;
@@ -137,7 +195,7 @@ static size_t HUF_getNbBitsFast(HUF_CElt elt)
 
 static size_t HUF_getValue(HUF_CElt elt)
 {
-    return elt & ~0xFF;
+    return elt & ~(size_t)0xFF;
 }
 
 static size_t HUF_getValueFast(HUF_CElt elt)
@@ -160,6 +218,25 @@ static void HUF_setValue(HUF_CElt* elt, size_t value)
     }
 }
 
+HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable)
+{
+    HUF_CTableHeader header;
+    ZSTD_memcpy(&header, ctable, sizeof(header));
+    return header;
+}
+
+static void HUF_writeCTableHeader(HUF_CElt* ctable, U32 tableLog, U32 maxSymbolValue)
+{
+    HUF_CTableHeader header;
+    HUF_STATIC_ASSERT(sizeof(ctable[0]) == sizeof(header));
+    ZSTD_memset(&header, 0, sizeof(header));
+    assert(tableLog < 256);
+    header.tableLog = (BYTE)tableLog;
+    assert(maxSymbolValue < 256);
+    header.maxSymbolValue = (BYTE)maxSymbolValue;
+    ZSTD_memcpy(ctable, &header, sizeof(header));
+}
+
 typedef struct {
     HUF_CompressWeightsWksp wksp;
     BYTE bitsToWeight[HUF_TABLELOG_MAX + 1];   /* precomputed conversion table */
@@ -175,6 +252,11 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
     U32 n;
     HUF_WriteCTableWksp* wksp = (HUF_WriteCTableWksp*)HUF_alignUpWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32));
 
+    HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE >= sizeof(HUF_WriteCTableWksp));
+
+    assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue);
+    assert(HUF_readCTableHeader(CTable).tableLog == huffLog);
+
     /* check conditions */
     if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC);
     if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
@@ -204,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize,
     return ((maxSymbolValue+1)/2) + 1;
 }
 
-/*! HUF_writeCTable() :
-    `CTable` : Huffman tree to save, using huf representation.
-    @return : size of saved CTable */
-size_t HUF_writeCTable (void* dst, size_t maxDstSize,
-                        const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog)
-{
-    HUF_WriteCTableWksp wksp;
-    return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp));
-}
-
 
 size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights)
 {
@@ -231,7 +303,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
     if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
     if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
 
-    CTable[0] = tableLog;
+    *maxSymbolValuePtr = nbSymbols - 1;
+
+    HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr);
 
     /* Prepare base value per rank */
     {   U32 n, nextRankStart = 0;
@@ -263,74 +337,71 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void
         { U32 n; for (n=0; n<nbSymbols; n++) HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++); }
     }
 
-    *maxSymbolValuePtr = nbSymbols - 1;
     return readSize;
 }
 
 U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue)
 {
-    const HUF_CElt* ct = CTable + 1;
+    const HUF_CElt* const ct = CTable + 1;
     assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
+    if (symbolValue > HUF_readCTableHeader(CTable).maxSymbolValue)
+        return 0;
     return (U32)HUF_getNbBits(ct[symbolValue]);
 }
 
 
-typedef struct nodeElt_s {
-    U32 count;
-    U16 parent;
-    BYTE byte;
-    BYTE nbBits;
-} nodeElt;
-
 /*
  * HUF_setMaxHeight():
- * Enforces maxNbBits on the Huffman tree described in huffNode.
+ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode.
  *
- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts
- * the tree to so that it is a valid canonical Huffman tree.
+ * It attempts to convert all nodes with nbBits > @targetNbBits
+ * to employ @targetNbBits instead. Then it adjusts the tree
+ * so that it remains a valid canonical Huffman tree.
  *
  * @pre               The sum of the ranks of each symbol == 2^largestBits,
  *                    where largestBits == huffNode[lastNonNull].nbBits.
  * @post              The sum of the ranks of each symbol == 2^largestBits,
- *                    where largestBits is the return value <= maxNbBits.
+ *                    where largestBits is the return value (expected <= targetNbBits).
  *
- * @param huffNode    The Huffman tree modified in place to enforce maxNbBits.
+ * @param huffNode    The Huffman tree modified in place to enforce targetNbBits.
+ *                    It's presumed sorted, from most frequent to rarest symbol.
  * @param lastNonNull The symbol with the lowest count in the Huffman tree.
- * @param maxNbBits   The maximum allowed number of bits, which the Huffman tree
+ * @param targetNbBits  The allowed number of bits, which the Huffman tree
  *                    may not respect. After this function the Huffman tree will
- *                    respect maxNbBits.
- * @return            The maximum number of bits of the Huffman tree after adjustment,
- *                    necessarily no more than maxNbBits.
+ *                    respect targetNbBits.
+ * @return            The maximum number of bits of the Huffman tree after adjustment.
  */
-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits)
 {
     const U32 largestBits = huffNode[lastNonNull].nbBits;
-    /* early exit : no elt > maxNbBits, so the tree is already valid. */
-    if (largestBits <= maxNbBits) return largestBits;
+    /* early exit : no elt > targetNbBits, so the tree is already valid. */
+    if (largestBits <= targetNbBits) return largestBits;
+
+    DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits);
 
     /* there are several too large elements (at least >= 2) */
     {   int totalCost = 0;
-        const U32 baseCost = 1 << (largestBits - maxNbBits);
+        const U32 baseCost = 1 << (largestBits - targetNbBits);
         int n = (int)lastNonNull;
 
-        /* Adjust any ranks > maxNbBits to maxNbBits.
+        /* Adjust any ranks > targetNbBits to targetNbBits.
          * Compute totalCost, which is how far the sum of the ranks is
          * we are over 2^largestBits after adjust the offending ranks.
          */
-        while (huffNode[n].nbBits > maxNbBits) {
+        while (huffNode[n].nbBits > targetNbBits) {
             totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
-            huffNode[n].nbBits = (BYTE)maxNbBits;
+            huffNode[n].nbBits = (BYTE)targetNbBits;
             n--;
         }
-        /* n stops at huffNode[n].nbBits <= maxNbBits */
-        assert(huffNode[n].nbBits <= maxNbBits);
-        /* n end at index of smallest symbol using < maxNbBits */
-        while (huffNode[n].nbBits == maxNbBits) --n;
+        /* n stops at huffNode[n].nbBits <= targetNbBits */
+        assert(huffNode[n].nbBits <= targetNbBits);
+        /* n end at index of smallest symbol using < targetNbBits */
+        while (huffNode[n].nbBits == targetNbBits) --n;
 
-        /* renorm totalCost from 2^largestBits to 2^maxNbBits
+        /* renorm totalCost from 2^largestBits to 2^targetNbBits
          * note : totalCost is necessarily a multiple of baseCost */
-        assert((totalCost & (baseCost - 1)) == 0);
-        totalCost >>= (largestBits - maxNbBits);
+        assert(((U32)totalCost & (baseCost - 1)) == 0);
+        totalCost >>= (largestBits - targetNbBits);
         assert(totalCost > 0);
 
         /* repay normalized cost */
@@ -339,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
 
             /* Get pos of last (smallest = lowest cum. count) symbol per rank */
             ZSTD_memset(rankLast, 0xF0, sizeof(rankLast));
-            {   U32 currentNbBits = maxNbBits;
+            {   U32 currentNbBits = targetNbBits;
                 int pos;
                 for (pos=n ; pos >= 0; pos--) {
                     if (huffNode[pos].nbBits >= currentNbBits) continue;
-                    currentNbBits = huffNode[pos].nbBits;   /* < maxNbBits */
-                    rankLast[maxNbBits-currentNbBits] = (U32)pos;
+                    currentNbBits = huffNode[pos].nbBits;   /* < targetNbBits */
+                    rankLast[targetNbBits-currentNbBits] = (U32)pos;
             }   }
 
             while (totalCost > 0) {
                 /* Try to reduce the next power of 2 above totalCost because we
                  * gain back half the rank.
                  */
-                U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1;
+                U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1;
                 for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
                     U32 const highPos = rankLast[nBitsToDecrease];
                     U32 const lowPos = rankLast[nBitsToDecrease-1];
@@ -391,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
                     rankLast[nBitsToDecrease] = noSymbol;
                 else {
                     rankLast[nBitsToDecrease]--;
-                    if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
+                    if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease)
                         rankLast[nBitsToDecrease] = noSymbol;   /* this rank is now empty */
                 }
             }   /* while (totalCost > 0) */
@@ -403,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
              * TODO.
              */
             while (totalCost < 0) {  /* Sometimes, cost correction overshoot */
-                /* special case : no rank 1 symbol (using maxNbBits-1);
-                 * let's create one from largest rank 0 (using maxNbBits).
+                /* special case : no rank 1 symbol (using targetNbBits-1);
+                 * let's create one from largest rank 0 (using targetNbBits).
                  */
                 if (rankLast[1] == noSymbol) {
-                    while (huffNode[n].nbBits == maxNbBits) n--;
+                    while (huffNode[n].nbBits == targetNbBits) n--;
                     huffNode[n+1].nbBits--;
                     assert(n >= 0);
                     rankLast[1] = (U32)(n+1);
@@ -421,7 +492,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
         }   /* repay normalized cost */
     }   /* there are several too large elements (at least >= 2) */
 
-    return maxNbBits;
+    return targetNbBits;
 }
 
 typedef struct {
@@ -429,7 +500,7 @@ typedef struct {
     U16 curr;
 } rankPos;
 
-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
+typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)];
 
 /* Number of buckets available for HUF_sort() */
 #define RANK_POSITION_TABLE_SIZE 192
@@ -448,8 +519,8 @@ typedef struct {
  * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
  */
 #define RANK_POSITION_MAX_COUNT_LOG 32
-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */
-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */
+#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */)
+#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */)
 
 /* Return the appropriate bucket index for a given count. See definition of
  * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
@@ -457,7 +528,7 @@ typedef struct {
 static U32 HUF_getIndex(U32 const count) {
     return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF)
         ? count
-        : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
+        : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN;
 }
 
 /* Helper swap function for HUF_quickSortPartition() */
@@ -580,7 +651,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy
 
     /* Sort each bucket. */
     for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) {
-        U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base;
+        int const bucketSize = rankPosition[n].curr - rankPosition[n].base;
         U32 const bucketStartIdx = rankPosition[n].base;
         if (bucketSize > 1) {
             assert(bucketStartIdx < maxSymbolValue1);
@@ -591,6 +662,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy
     assert(HUF_isSorted(huffNode, maxSymbolValue1));
 }
 
+
 /* HUF_buildCTable_wksp() :
  *  Same as HUF_buildCTable(), but using externally allocated scratch buffer.
  *  `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables).
@@ -611,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
     int lowS, lowN;
     int nodeNb = STARTNODE;
     int n, nodeRoot;
+    DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1);
     /* init for parents */
     nonNullRank = (int)maxSymbolValue;
     while(huffNode[nonNullRank].count == 0) nonNullRank--;
@@ -637,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue)
     for (n=0; n<=nonNullRank; n++)
         huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
 
+    DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1));
+
     return nonNullRank;
 }
 
@@ -671,31 +746,40 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i
         HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits);   /* push nbBits per symbol, symbol order */
     for (n=0; n<alphabetSize; n++)
         HUF_setValue(ct + n, valPerRank[HUF_getNbBits(ct[n])]++);   /* assign value within rank, symbol order */
-    CTable[0] = maxNbBits;
+
+    HUF_writeCTableHeader(CTable, maxNbBits, maxSymbolValue);
 }
 
-size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+size_t
+HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
+                     void* workSpace, size_t wkspSize)
 {
-    HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
+    HUF_buildCTable_wksp_tables* const wksp_tables =
+        (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32));
     nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
     nodeElt* const huffNode = huffNode0+1;
     int nonNullRank;
 
+    HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables));
+
+    DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1);
+
     /* safety checks */
     if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
-      return ERROR(workSpace_tooSmall);
+        return ERROR(workSpace_tooSmall);
     if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
     if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
-      return ERROR(maxSymbolValue_tooLarge);
+        return ERROR(maxSymbolValue_tooLarge);
     ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable));
 
     /* sort, decreasing order */
     HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition);
+    DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1));
 
     /* build tree */
     nonNullRank = HUF_buildTree(huffNode, maxSymbolValue);
 
-    /* enforce maxTableLog */
+    /* determine and enforce maxTableLog */
     maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
     if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC);   /* check fit into table */
 
@@ -716,13 +800,20 @@ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count,
 }
 
 int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
-  HUF_CElt const* ct = CTable + 1;
-  int bad = 0;
-  int s;
-  for (s = 0; s <= (int)maxSymbolValue; ++s) {
-    bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0);
-  }
-  return !bad;
+    HUF_CTableHeader header = HUF_readCTableHeader(CTable);
+    HUF_CElt const* ct = CTable + 1;
+    int bad = 0;
+    int s;
+
+    assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX);
+
+    if (header.maxSymbolValue < maxSymbolValue)
+        return 0;
+
+    for (s = 0; s <= (int)maxSymbolValue; ++s) {
+        bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0);
+    }
+    return !bad;
 }
 
 size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
@@ -804,7 +895,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id
 #if DEBUGLEVEL >= 1
     {
         size_t const nbBits = HUF_getNbBits(elt);
-        size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1;
+        size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1;
         (void)dirtyBits;
         /* Middle bits are 0. */
         assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0);
@@ -884,7 +975,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC)
     {
         size_t const nbBits = bitC->bitPos[0] & 0xFF;
         if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
-        return (bitC->ptr - bitC->startPtr) + (nbBits > 0);
+        return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0);
     }
 }
 
@@ -964,17 +1055,17 @@ HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
                                    const void* src, size_t srcSize,
                                    const HUF_CElt* CTable)
 {
-    U32 const tableLog = (U32)CTable[0];
+    U32 const tableLog = HUF_readCTableHeader(CTable).tableLog;
     HUF_CElt const* ct = CTable + 1;
     const BYTE* ip = (const BYTE*) src;
     BYTE* const ostart = (BYTE*)dst;
     BYTE* const oend = ostart + dstSize;
-    BYTE* op = ostart;
     HUF_CStream_t bitC;
 
     /* init */
     if (dstSize < 8) return 0;   /* not enough space to compress */
-    { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op));
+    { BYTE* op = ostart;
+      size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op));
       if (HUF_isError(initErr)) return 0; }
 
     if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11)
@@ -1045,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
 static size_t
 HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
                               const void* src, size_t srcSize,
-                              const HUF_CElt* CTable, const int bmi2)
+                              const HUF_CElt* CTable, const int flags)
 {
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
         return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
     }
     return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
@@ -1058,28 +1149,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
 static size_t
 HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
                               const void* src, size_t srcSize,
-                              const HUF_CElt* CTable, const int bmi2)
+                              const HUF_CElt* CTable, const int flags)
 {
-    (void)bmi2;
+    (void)flags;
     return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
 }
 
 #endif
 
-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
 {
-    return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
-}
-
-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
-{
-    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+    return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
 }
 
 static size_t
 HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
                               const void* src, size_t srcSize,
-                              const HUF_CElt* CTable, int bmi2)
+                              const HUF_CElt* CTable, int flags)
 {
     size_t const segmentSize = (srcSize+3)/4;   /* first 3 segments */
     const BYTE* ip = (const BYTE*) src;
@@ -1093,7 +1179,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
     op += 6;   /* jumpTable */
 
     assert(op <= oend);
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
         if (cSize == 0 || cSize > 65535) return 0;
         MEM_writeLE16(ostart, (U16)cSize);
         op += cSize;
@@ -1101,7 +1187,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
 
     ip += segmentSize;
     assert(op <= oend);
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
         if (cSize == 0 || cSize > 65535) return 0;
         MEM_writeLE16(ostart+2, (U16)cSize);
         op += cSize;
@@ -1109,7 +1195,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
 
     ip += segmentSize;
     assert(op <= oend);
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) );
         if (cSize == 0 || cSize > 65535) return 0;
         MEM_writeLE16(ostart+4, (U16)cSize);
         op += cSize;
@@ -1118,7 +1204,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
     ip += segmentSize;
     assert(op <= oend);
     assert(ip <= iend);
-    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) );
+    {   CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) );
         if (cSize == 0 || cSize > 65535) return 0;
         op += cSize;
     }
@@ -1126,14 +1212,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
     return (size_t)(op-ostart);
 }
 
-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
-{
-    return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
-}
-
-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2)
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags)
 {
-    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2);
+    return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags);
 }
 
 typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
@@ -1141,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
 static size_t HUF_compressCTable_internal(
                 BYTE* const ostart, BYTE* op, BYTE* const oend,
                 const void* src, size_t srcSize,
-                HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2)
+                HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags)
 {
     size_t const cSize = (nbStreams==HUF_singleStream) ?
-                         HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) :
-                         HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2);
+                         HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) :
+                         HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags);
     if (HUF_isError(cSize)) { return cSize; }
     if (cSize==0) { return 0; }   /* uncompressible */
     op += cSize;
@@ -1168,6 +1249,81 @@ typedef struct {
 #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096
 #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10  /* Must be >= 2 */
 
+unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue)
+{
+    unsigned cardinality = 0;
+    unsigned i;
+
+    for (i = 0; i < maxSymbolValue + 1; i++) {
+        if (count[i] != 0) cardinality += 1;
+    }
+
+    return cardinality;
+}
+
+unsigned HUF_minTableLog(unsigned symbolCardinality)
+{
+    U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1;
+    return minBitsSymbols;
+}
+
+unsigned HUF_optimalTableLog(
+            unsigned maxTableLog,
+            size_t srcSize,
+            unsigned maxSymbolValue,
+            void* workSpace, size_t wkspSize,
+            HUF_CElt* table,
+      const unsigned* count,
+            int flags)
+{
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables));
+
+    if (!(flags & HUF_flags_optimalDepth)) {
+        /* cheap evaluation, based on FSE */
+        return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+    }
+
+    {   BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp);
+        size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp);
+        size_t hSize, newSize;
+        const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue);
+        const unsigned minTableLog = HUF_minTableLog(symbolCardinality);
+        size_t optSize = ((size_t) ~0) - 1;
+        unsigned optLog = maxTableLog, optLogGuess;
+
+        DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize);
+
+        /* Search until size increases */
+        for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) {
+            DEBUGLOG(7, "checking for huffLog=%u", optLogGuess);
+
+            {   size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize);
+                if (ERR_isError(maxBits)) continue;
+
+                if (maxBits < optLogGuess && optLogGuess > minTableLog) break;
+
+                hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize);
+            }
+
+            if (ERR_isError(hSize)) continue;
+
+            newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize;
+
+            if (newSize > optSize + 1) {
+                break;
+            }
+
+            if (newSize < optSize) {
+                optSize = newSize;
+                optLog = optLogGuess;
+            }
+        }
+        assert(optLog <= HUF_TABLELOG_MAX);
+        return optLog;
+    }
+}
+
 /* HUF_compress_internal() :
  * `workSpace_align4` must be aligned on 4-bytes boundaries,
  * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */
@@ -1177,14 +1333,14 @@ HUF_compress_internal (void* dst, size_t dstSize,
                        unsigned maxSymbolValue, unsigned huffLog,
                        HUF_nbStreams_e nbStreams,
                        void* workSpace, size_t wkspSize,
-                       HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
-                 const int bmi2, unsigned suspectUncompressible)
+                       HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags)
 {
     HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t));
     BYTE* const ostart = (BYTE*)dst;
     BYTE* const oend = ostart + dstSize;
     BYTE* op = ostart;
 
+    DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize);
     HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE);
 
     /* checks & inits */
@@ -1198,16 +1354,17 @@ HUF_compress_internal (void* dst, size_t dstSize,
     if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;
 
     /* Heuristic : If old table is valid, use it for small inputs */
-    if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
+    if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) {
         return HUF_compressCTable_internal(ostart, op, oend,
                                            src, srcSize,
-                                           nbStreams, oldHufTable, bmi2);
+                                           nbStreams, oldHufTable, flags);
     }
 
     /* If uncompressible data is suspected, do a smaller sampling first */
     DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2);
-    if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
+    if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) {
         size_t largestTotal = 0;
+        DEBUGLOG(5, "input suspected incompressible : sampling to check");
         {   unsigned maxSymbolValueBegin = maxSymbolValue;
             CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) );
             largestTotal += largestBegin;
@@ -1224,6 +1381,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
         if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; }   /* single symbol, rle */
         if (largest <= (srcSize >> 7)+4) return 0;   /* heuristic : probably not compressible enough */
     }
+    DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1));
 
     /* Check validity of previous table */
     if ( repeat
@@ -1232,25 +1390,20 @@ HUF_compress_internal (void* dst, size_t dstSize,
         *repeat = HUF_repeat_none;
     }
     /* Heuristic : use existing table for small inputs */
-    if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
+    if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) {
         return HUF_compressCTable_internal(ostart, op, oend,
                                            src, srcSize,
-                                           nbStreams, oldHufTable, bmi2);
+                                           nbStreams, oldHufTable, flags);
     }
 
     /* Build Huffman Tree */
-    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags);
     {   size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count,
                                             maxSymbolValue, huffLog,
                                             &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp));
         CHECK_F(maxBits);
         huffLog = (U32)maxBits;
-    }
-    /* Zero unused symbols in CTable, so we can check it for validity */
-    {
-        size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue);
-        size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt);
-        ZSTD_memset(table->CTable + ctableSize, 0, unusedSize);
+        DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1));
     }
 
     /* Write table description header */
@@ -1263,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t dstSize,
             if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
                 return HUF_compressCTable_internal(ostart, op, oend,
                                                    src, srcSize,
-                                                   nbStreams, oldHufTable, bmi2);
+                                                   nbStreams, oldHufTable, flags);
         }   }
 
         /* Use the new huffman table */
@@ -1275,61 +1428,35 @@ HUF_compress_internal (void* dst, size_t dstSize,
     }
     return HUF_compressCTable_internal(ostart, op, oend,
                                        src, srcSize,
-                                       nbStreams, table->CTable, bmi2);
-}
-
-
-size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
-                      const void* src, size_t srcSize,
-                      unsigned maxSymbolValue, unsigned huffLog,
-                      void* workSpace, size_t wkspSize)
-{
-    return HUF_compress_internal(dst, dstSize, src, srcSize,
-                                 maxSymbolValue, huffLog, HUF_singleStream,
-                                 workSpace, wkspSize,
-                                 NULL, NULL, 0, 0 /*bmi2*/, 0);
+                                       nbStreams, table->CTable, flags);
 }
 
 size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
                       const void* src, size_t srcSize,
                       unsigned maxSymbolValue, unsigned huffLog,
                       void* workSpace, size_t wkspSize,
-                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat,
-                      int bmi2, unsigned suspectUncompressible)
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
 {
+    DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize);
     return HUF_compress_internal(dst, dstSize, src, srcSize,
                                  maxSymbolValue, huffLog, HUF_singleStream,
                                  workSpace, wkspSize, hufTable,
-                                 repeat, preferRepeat, bmi2, suspectUncompressible);
-}
-
-/* HUF_compress4X_repeat():
- * compress input using 4 streams.
- * provide workspace to generate compression tables */
-size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
-                      const void* src, size_t srcSize,
-                      unsigned maxSymbolValue, unsigned huffLog,
-                      void* workSpace, size_t wkspSize)
-{
-    return HUF_compress_internal(dst, dstSize, src, srcSize,
-                                 maxSymbolValue, huffLog, HUF_fourStreams,
-                                 workSpace, wkspSize,
-                                 NULL, NULL, 0, 0 /*bmi2*/, 0);
+                                 repeat, flags);
 }
 
 /* HUF_compress4X_repeat():
  * compress input using 4 streams.
  * consider skipping quickly
- * re-use an existing huffman compression table */
+ * reuse an existing huffman compression table */
 size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
                       const void* src, size_t srcSize,
                       unsigned maxSymbolValue, unsigned huffLog,
                       void* workSpace, size_t wkspSize,
-                      HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible)
+                      HUF_CElt* hufTable, HUF_repeat* repeat, int flags)
 {
+    DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize);
     return HUF_compress_internal(dst, dstSize, src, srcSize,
                                  maxSymbolValue, huffLog, HUF_fourStreams,
                                  workSpace, wkspSize,
-                                 hufTable, repeat, preferRepeat, bmi2, suspectUncompressible);
+                                 hufTable, repeat, flags);
 }
-
diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c
index f620cafca633..0d139727cd39 100644
--- a/lib/zstd/compress/zstd_compress.c
+++ b/lib/zstd/compress/zstd_compress.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,12 +12,12 @@
 /*-*************************************
 *  Dependencies
 ***************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
 #include "../common/zstd_deps.h"  /* INT_MAX, ZSTD_memset, ZSTD_memcpy */
 #include "../common/mem.h"
 #include "hist.h"           /* HIST_countFast_wksp */
 #define FSE_STATIC_LINKING_ONLY   /* FSE_encodeSymbol */
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "zstd_compress_internal.h"
 #include "zstd_compress_sequences.h"
@@ -27,6 +28,7 @@
 #include "zstd_opt.h"
 #include "zstd_ldm.h"
 #include "zstd_compress_superblock.h"
+#include  "../common/bits.h"      /* ZSTD_highbit32, ZSTD_rotateRight_U64 */
 
 /* ***************************************************************
 *  Tuning parameters
@@ -55,14 +57,17 @@
 *  Helper functions
 ***************************************/
 /* ZSTD_compressBound()
- * Note that the result from this function is only compatible with the "normal"
- * full-block strategy.
- * When there are a lot of small blocks due to frequent flush in streaming mode
- * the overhead of headers can make the compressed data to be larger than the
- * return value of ZSTD_compressBound().
+ * Note that the result from this function is only valid for
+ * the one-pass compression functions.
+ * When employing the streaming mode,
+ * if flushes are frequently altering the size of blocks,
+ * the overhead from block headers can make the compressed data larger
+ * than the return value of ZSTD_compressBound().
  */
 size_t ZSTD_compressBound(size_t srcSize) {
-    return ZSTD_COMPRESSBOUND(srcSize);
+    size_t const r = ZSTD_COMPRESSBOUND(srcSize);
+    if (r==0) return ERROR(srcSize_wrong);
+    return r;
 }
 
 
@@ -168,15 +173,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx)
 
 size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
 {
+    DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx);
     if (cctx==NULL) return 0;   /* support free on NULL */
     RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
                     "not compatible with static CCtx");
-    {
-        int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);
+    {   int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);
         ZSTD_freeCCtxContent(cctx);
-        if (!cctxInWorkspace) {
-            ZSTD_customFree(cctx, cctx->customMem);
-        }
+        if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem);
     }
     return 0;
 }
@@ -257,9 +260,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy,
     return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder));
 }
 
-/* Returns 1 if compression parameters are such that we should
+/* Returns ZSTD_ps_enable if compression parameters are such that we should
  * enable long distance matching (wlog >= 27, strategy >= btopt).
- * Returns 0 otherwise.
+ * Returns ZSTD_ps_disable otherwise.
  */
 static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
                                  const ZSTD_compressionParameters* const cParams) {
@@ -267,6 +270,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode,
     return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable;
 }
 
+static int ZSTD_resolveExternalSequenceValidation(int mode) {
+    return mode;
+}
+
+/* Resolves maxBlockSize to the default if no value is present. */
+static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) {
+    if (maxBlockSize == 0) {
+        return ZSTD_BLOCKSIZE_MAX;
+    } else {
+        return maxBlockSize;
+    }
+}
+
+static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) {
+    if (value != ZSTD_ps_auto) return value;
+    if (cLevel < 10) {
+        return ZSTD_ps_disable;
+    } else {
+        return ZSTD_ps_enable;
+    }
+}
+
+/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged.
+ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */
+static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) {
+    return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast;
+}
+
 static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
         ZSTD_compressionParameters cParams)
 {
@@ -284,6 +315,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
     }
     cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams);
     cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams);
+    cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences);
+    cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize);
+    cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes,
+                                                                             cctxParams.compressionLevel);
     assert(!ZSTD_checkCParams(cParams));
     return cctxParams;
 }
@@ -329,10 +364,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel)
 #define ZSTD_NO_CLEVEL 0
 
 /*
- * Initializes the cctxParams from params and compressionLevel.
+ * Initializes `cctxParams` from `params` and `compressionLevel`.
  * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL.
  */
-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel)
+static void
+ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams,
+                        const ZSTD_parameters* params,
+                              int compressionLevel)
 {
     assert(!ZSTD_checkCParams(params->cParams));
     ZSTD_memset(cctxParams, 0, sizeof(*cctxParams));
@@ -345,6 +383,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par
     cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
     cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
     cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
+    cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences);
+    cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize);
+    cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel);
     DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
                 cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
 }
@@ -359,7 +400,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete
 
 /*
  * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone.
- * @param param Validated zstd parameters.
+ * @param params Validated zstd parameters.
  */
 static void ZSTD_CCtxParams_setZstdParams(
         ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params)
@@ -455,8 +496,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
         return bounds;
 
     case ZSTD_c_enableLongDistanceMatching:
-        bounds.lowerBound = 0;
-        bounds.upperBound = 1;
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
         return bounds;
 
     case ZSTD_c_ldmHashLog:
@@ -549,6 +590,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
         bounds.upperBound = 1;
         return bounds;
 
+    case ZSTD_c_prefetchCDictTables:
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
+    case ZSTD_c_enableSeqProducerFallback:
+        bounds.lowerBound = 0;
+        bounds.upperBound = 1;
+        return bounds;
+
+    case ZSTD_c_maxBlockSize:
+        bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN;
+        bounds.upperBound = ZSTD_BLOCKSIZE_MAX;
+        return bounds;
+
+    case ZSTD_c_searchForExternalRepcodes:
+        bounds.lowerBound = (int)ZSTD_ps_auto;
+        bounds.upperBound = (int)ZSTD_ps_disable;
+        return bounds;
+
     default:
         bounds.error = ERROR(parameter_unsupported);
         return bounds;
@@ -567,10 +628,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value)
     return 0;
 }
 
-#define BOUNDCHECK(cParam, val) { \
-    RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \
-                    parameter_outOfBound, "Param out of bounds"); \
-}
+#define BOUNDCHECK(cParam, val)                                       \
+    do {                                                              \
+        RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val),        \
+                        parameter_outOfBound, "Param out of bounds"); \
+    } while (0)
 
 
 static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
@@ -613,6 +675,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
     case ZSTD_c_useBlockSplitter:
     case ZSTD_c_useRowMatchFinder:
     case ZSTD_c_deterministicRefPrefix:
+    case ZSTD_c_prefetchCDictTables:
+    case ZSTD_c_enableSeqProducerFallback:
+    case ZSTD_c_maxBlockSize:
+    case ZSTD_c_searchForExternalRepcodes:
     default:
         return 0;
     }
@@ -625,7 +691,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
         if (ZSTD_isUpdateAuthorized(param)) {
             cctx->cParamsChanged = 1;
         } else {
-            RETURN_ERROR(stage_wrong, "can only set params in ctx init stage");
+            RETURN_ERROR(stage_wrong, "can only set params in cctx init stage");
     }   }
 
     switch(param)
@@ -668,6 +734,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
     case ZSTD_c_useBlockSplitter:
     case ZSTD_c_useRowMatchFinder:
     case ZSTD_c_deterministicRefPrefix:
+    case ZSTD_c_prefetchCDictTables:
+    case ZSTD_c_enableSeqProducerFallback:
+    case ZSTD_c_maxBlockSize:
+    case ZSTD_c_searchForExternalRepcodes:
         break;
 
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
@@ -723,12 +793,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
     case ZSTD_c_minMatch :
         if (value!=0)   /* 0 => use default */
             BOUNDCHECK(ZSTD_c_minMatch, value);
-        CCtxParams->cParams.minMatch = value;
+        CCtxParams->cParams.minMatch = (U32)value;
         return CCtxParams->cParams.minMatch;
 
     case ZSTD_c_targetLength :
         BOUNDCHECK(ZSTD_c_targetLength, value);
-        CCtxParams->cParams.targetLength = value;
+        CCtxParams->cParams.targetLength = (U32)value;
         return CCtxParams->cParams.targetLength;
 
     case ZSTD_c_strategy :
@@ -741,12 +811,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
         /* Content size written in frame header _when known_ (default:1) */
         DEBUGLOG(4, "set content size flag = %u", (value!=0));
         CCtxParams->fParams.contentSizeFlag = value != 0;
-        return CCtxParams->fParams.contentSizeFlag;
+        return (size_t)CCtxParams->fParams.contentSizeFlag;
 
     case ZSTD_c_checksumFlag :
         /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */
         CCtxParams->fParams.checksumFlag = value != 0;
-        return CCtxParams->fParams.checksumFlag;
+        return (size_t)CCtxParams->fParams.checksumFlag;
 
     case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */
         DEBUGLOG(4, "set dictIDFlag = %u", (value!=0));
@@ -755,18 +825,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
 
     case ZSTD_c_forceMaxWindow :
         CCtxParams->forceWindow = (value != 0);
-        return CCtxParams->forceWindow;
+        return (size_t)CCtxParams->forceWindow;
 
     case ZSTD_c_forceAttachDict : {
         const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value;
-        BOUNDCHECK(ZSTD_c_forceAttachDict, pref);
+        BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref);
         CCtxParams->attachDictPref = pref;
         return CCtxParams->attachDictPref;
     }
 
     case ZSTD_c_literalCompressionMode : {
         const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value;
-        BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm);
+        BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm);
         CCtxParams->literalCompressionMode = lcm;
         return CCtxParams->literalCompressionMode;
     }
@@ -789,47 +859,50 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
 
     case ZSTD_c_enableDedicatedDictSearch :
         CCtxParams->enableDedicatedDictSearch = (value!=0);
-        return CCtxParams->enableDedicatedDictSearch;
+        return (size_t)CCtxParams->enableDedicatedDictSearch;
 
     case ZSTD_c_enableLongDistanceMatching :
+        BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value);
         CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value;
         return CCtxParams->ldmParams.enableLdm;
 
     case ZSTD_c_ldmHashLog :
         if (value!=0)   /* 0 ==> auto */
             BOUNDCHECK(ZSTD_c_ldmHashLog, value);
-        CCtxParams->ldmParams.hashLog = value;
+        CCtxParams->ldmParams.hashLog = (U32)value;
         return CCtxParams->ldmParams.hashLog;
 
     case ZSTD_c_ldmMinMatch :
         if (value!=0)   /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_ldmMinMatch, value);
-        CCtxParams->ldmParams.minMatchLength = value;
+        CCtxParams->ldmParams.minMatchLength = (U32)value;
         return CCtxParams->ldmParams.minMatchLength;
 
     case ZSTD_c_ldmBucketSizeLog :
         if (value!=0)   /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value);
-        CCtxParams->ldmParams.bucketSizeLog = value;
+        CCtxParams->ldmParams.bucketSizeLog = (U32)value;
         return CCtxParams->ldmParams.bucketSizeLog;
 
     case ZSTD_c_ldmHashRateLog :
         if (value!=0)   /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_ldmHashRateLog, value);
-        CCtxParams->ldmParams.hashRateLog = value;
+        CCtxParams->ldmParams.hashRateLog = (U32)value;
         return CCtxParams->ldmParams.hashRateLog;
 
     case ZSTD_c_targetCBlockSize :
-        if (value!=0)   /* 0 ==> default */
+        if (value!=0) {  /* 0 ==> default */
+            value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN);
             BOUNDCHECK(ZSTD_c_targetCBlockSize, value);
-        CCtxParams->targetCBlockSize = value;
+        }
+        CCtxParams->targetCBlockSize = (U32)value;
         return CCtxParams->targetCBlockSize;
 
     case ZSTD_c_srcSizeHint :
         if (value!=0)    /* 0 ==> default */
             BOUNDCHECK(ZSTD_c_srcSizeHint, value);
         CCtxParams->srcSizeHint = value;
-        return CCtxParams->srcSizeHint;
+        return (size_t)CCtxParams->srcSizeHint;
 
     case ZSTD_c_stableInBuffer:
         BOUNDCHECK(ZSTD_c_stableInBuffer, value);
@@ -849,7 +922,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
     case ZSTD_c_validateSequences:
         BOUNDCHECK(ZSTD_c_validateSequences, value);
         CCtxParams->validateSequences = value;
-        return CCtxParams->validateSequences;
+        return (size_t)CCtxParams->validateSequences;
 
     case ZSTD_c_useBlockSplitter:
         BOUNDCHECK(ZSTD_c_useBlockSplitter, value);
@@ -864,7 +937,28 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
     case ZSTD_c_deterministicRefPrefix:
         BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value);
         CCtxParams->deterministicRefPrefix = !!value;
-        return CCtxParams->deterministicRefPrefix;
+        return (size_t)CCtxParams->deterministicRefPrefix;
+
+    case ZSTD_c_prefetchCDictTables:
+        BOUNDCHECK(ZSTD_c_prefetchCDictTables, value);
+        CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value;
+        return CCtxParams->prefetchCDictTables;
+
+    case ZSTD_c_enableSeqProducerFallback:
+        BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value);
+        CCtxParams->enableMatchFinderFallback = value;
+        return (size_t)CCtxParams->enableMatchFinderFallback;
+
+    case ZSTD_c_maxBlockSize:
+        if (value!=0)    /* 0 ==> default */
+            BOUNDCHECK(ZSTD_c_maxBlockSize, value);
+        CCtxParams->maxBlockSize = value;
+        return CCtxParams->maxBlockSize;
+
+    case ZSTD_c_searchForExternalRepcodes:
+        BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value);
+        CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value;
+        return CCtxParams->searchForExternalRepcodes;
 
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
     }
@@ -980,6 +1074,18 @@ size_t ZSTD_CCtxParams_getParameter(
     case ZSTD_c_deterministicRefPrefix:
         *value = (int)CCtxParams->deterministicRefPrefix;
         break;
+    case ZSTD_c_prefetchCDictTables:
+        *value = (int)CCtxParams->prefetchCDictTables;
+        break;
+    case ZSTD_c_enableSeqProducerFallback:
+        *value = CCtxParams->enableMatchFinderFallback;
+        break;
+    case ZSTD_c_maxBlockSize:
+        *value = (int)CCtxParams->maxBlockSize;
+        break;
+    case ZSTD_c_searchForExternalRepcodes:
+        *value = (int)CCtxParams->searchForExternalRepcodes;
+        break;
     default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
     }
     return 0;
@@ -1006,9 +1112,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams(
     return 0;
 }
 
+size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams)
+{
+    ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */);
+    DEBUGLOG(4, "ZSTD_CCtx_setCParams");
+    /* only update if all parameters are valid */
+    FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), "");
+    return 0;
+}
+
+size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams)
+{
+    ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */);
+    DEBUGLOG(4, "ZSTD_CCtx_setFParams");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), "");
+    FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), "");
+    return 0;
+}
+
+size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params)
+{
+    DEBUGLOG(4, "ZSTD_CCtx_setParams");
+    /* First check cParams, because we want to update all or none. */
+    FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), "");
+    /* Next set fParams, because this could fail if the cctx isn't in init stage. */
+    FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), "");
+    /* Finally set cParams, which should succeed. */
+    FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), "");
+    return 0;
+}
+
 size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
 {
-    DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize);
+    DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize);
     RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
                     "Can't set pledgedSrcSize when not in init stage.");
     cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
@@ -1024,9 +1168,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams(
         ZSTD_compressionParameters* cParams);
 
 /*
- * Initializes the local dict using the requested parameters.
- * NOTE: This does not use the pledged src size, because it may be used for more
- * than one compression.
+ * Initializes the local dictionary using requested parameters.
+ * NOTE: Initialization does not employ the pledged src size,
+ * because the dictionary may be used for multiple compressions.
  */
 static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
 {
@@ -1039,8 +1183,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
         return 0;
     }
     if (dl->cdict != NULL) {
-        assert(cctx->cdict == dl->cdict);
         /* Local dictionary already initialized. */
+        assert(cctx->cdict == dl->cdict);
         return 0;
     }
     assert(dl->dictSize > 0);
@@ -1060,26 +1204,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
 }
 
 size_t ZSTD_CCtx_loadDictionary_advanced(
-        ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
-        ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+        ZSTD_CCtx* cctx,
+        const void* dict, size_t dictSize,
+        ZSTD_dictLoadMethod_e dictLoadMethod,
+        ZSTD_dictContentType_e dictContentType)
 {
-    RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
-                    "Can't load a dictionary when ctx is not in init stage.");
     DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
-    ZSTD_clearAllDicts(cctx);  /* in case one already exists */
-    if (dict == NULL || dictSize == 0)  /* no dictionary mode */
+    RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+                    "Can't load a dictionary when cctx is not in init stage.");
+    ZSTD_clearAllDicts(cctx);  /* erase any previously set dictionary */
+    if (dict == NULL || dictSize == 0)  /* no dictionary */
         return 0;
     if (dictLoadMethod == ZSTD_dlm_byRef) {
         cctx->localDict.dict = dict;
     } else {
+        /* copy dictionary content inside CCtx to own its lifetime */
         void* dictBuffer;
         RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
-                        "no malloc for static CCtx");
+                        "static CCtx can't allocate for an internal copy of dictionary");
         dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem);
-        RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!");
+        RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation,
+                        "allocation failed for dictionary content");
         ZSTD_memcpy(dictBuffer, dict, dictSize);
-        cctx->localDict.dictBuffer = dictBuffer;
-        cctx->localDict.dict = dictBuffer;
+        cctx->localDict.dictBuffer = dictBuffer;  /* owned ptr to free */
+        cctx->localDict.dict = dictBuffer;        /* read-only reference */
     }
     cctx->localDict.dictSize = dictSize;
     cctx->localDict.dictContentType = dictContentType;
@@ -1149,7 +1297,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
     if ( (reset == ZSTD_reset_parameters)
       || (reset == ZSTD_reset_session_and_parameters) ) {
         RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
-                        "Can't reset parameters only when not in init stage.");
+                        "Reset parameters is only possible during init stage.");
         ZSTD_clearAllDicts(cctx);
         return ZSTD_CCtxParams_reset(&cctx->requestedParams);
     }
@@ -1178,11 +1326,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
 static ZSTD_compressionParameters
 ZSTD_clampCParams(ZSTD_compressionParameters cParams)
 {
-#   define CLAMP_TYPE(cParam, val, type) {                                \
-        ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);         \
-        if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound;      \
-        else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
-    }
+#   define CLAMP_TYPE(cParam, val, type)                                      \
+        do {                                                                  \
+            ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);         \
+            if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound;      \
+            else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
+        } while (0)
 #   define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned)
     CLAMP(ZSTD_c_windowLog, cParams.windowLog);
     CLAMP(ZSTD_c_chainLog,  cParams.chainLog);
@@ -1247,12 +1396,55 @@ static ZSTD_compressionParameters
 ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
                             unsigned long long srcSize,
                             size_t dictSize,
-                            ZSTD_cParamMode_e mode)
+                            ZSTD_cParamMode_e mode,
+                            ZSTD_paramSwitch_e useRowMatchFinder)
 {
     const U64 minSrcSize = 513; /* (1<<9) + 1 */
     const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
     assert(ZSTD_checkCParams(cPar)==0);
 
+    /* Cascade the selected strategy down to the next-highest one built into
+     * this binary. */
+#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_btultra2) {
+        cPar.strategy = ZSTD_btultra;
+    }
+    if (cPar.strategy == ZSTD_btultra) {
+        cPar.strategy = ZSTD_btopt;
+    }
+#endif
+#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_btopt) {
+        cPar.strategy = ZSTD_btlazy2;
+    }
+#endif
+#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_btlazy2) {
+        cPar.strategy = ZSTD_lazy2;
+    }
+#endif
+#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_lazy2) {
+        cPar.strategy = ZSTD_lazy;
+    }
+#endif
+#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_lazy) {
+        cPar.strategy = ZSTD_greedy;
+    }
+#endif
+#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_greedy) {
+        cPar.strategy = ZSTD_dfast;
+    }
+#endif
+#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
+    if (cPar.strategy == ZSTD_dfast) {
+        cPar.strategy = ZSTD_fast;
+        cPar.targetLength = 0;
+    }
+#endif
+
     switch (mode) {
     case ZSTD_cpm_unknown:
     case ZSTD_cpm_noAttachDict:
@@ -1281,8 +1473,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
     }
 
     /* resize windowLog if input is small enough, to use less memory */
-    if ( (srcSize < maxWindowResize)
-      && (dictSize < maxWindowResize) )  {
+    if ( (srcSize <= maxWindowResize)
+      && (dictSize <= maxWindowResize) )  {
         U32 const tSize = (U32)(srcSize + dictSize);
         static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN;
         U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN :
@@ -1300,6 +1492,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
     if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
         cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN;  /* minimum wlog required for valid frame header */
 
+    /* We can't use more than 32 bits of hash in total, so that means that we require:
+     * (hashLog + 8) <= 32 && (chainLog + 8) <= 32
+     */
+    if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) {
+        U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS;
+        if (cPar.hashLog > maxShortCacheHashLog) {
+            cPar.hashLog = maxShortCacheHashLog;
+        }
+        if (cPar.chainLog > maxShortCacheHashLog) {
+            cPar.chainLog = maxShortCacheHashLog;
+        }
+    }
+
+
+    /* At this point, we aren't 100% sure if we are using the row match finder.
+     * Unless it is explicitly disabled, conservatively assume that it is enabled.
+     * In this case it will only be disabled for small sources, so shrinking the
+     * hash log a little bit shouldn't result in any ratio loss.
+     */
+    if (useRowMatchFinder == ZSTD_ps_auto)
+        useRowMatchFinder = ZSTD_ps_enable;
+
+    /* We can't hash more than 32-bits in total. So that means that we require:
+     * (hashLog - rowLog + 8) <= 32
+     */
+    if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) {
+        /* Switch to 32-entry rows if searchLog is 5 (or more) */
+        U32 const rowLog = BOUNDED(4, cPar.searchLog, 6);
+        U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS;
+        U32 const maxHashLog = maxRowHashLog + rowLog;
+        assert(cPar.hashLog >= rowLog);
+        if (cPar.hashLog > maxHashLog) {
+            cPar.hashLog = maxHashLog;
+        }
+    }
+
     return cPar;
 }
 
@@ -1310,7 +1538,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
 {
     cPar = ZSTD_clampCParams(cPar);   /* resulting cPar is necessarily valid (all parameters within range) */
     if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN;
-    return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown);
+    return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto);
 }
 
 static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode);
@@ -1341,7 +1569,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
     ZSTD_overrideCParams(&cParams, &CCtxParams->cParams);
     assert(!ZSTD_checkCParams(cParams));
     /* srcSizeHint == 0 means 0 */
-    return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode);
+    return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder);
 }
 
 static size_t
@@ -1367,10 +1595,10 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
       + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32))
       + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32))
       + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32))
-      + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
-      + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+      + ZSTD_cwksp_aligned_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_match_t))
+      + ZSTD_cwksp_aligned_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t));
     size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)
-                                            ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16))
+                                            ? ZSTD_cwksp_aligned_alloc_size(hSize)
                                             : 0;
     size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
                                 ? optPotentialSpace
@@ -1386,6 +1614,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
     return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
 }
 
+/* Helper function for calculating memory requirements.
+ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */
+static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) {
+    U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4;
+    return blockSize / divider;
+}
+
 static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
         const ZSTD_compressionParameters* cParams,
         const ldmParams_t* ldmParams,
@@ -1393,12 +1628,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
         const ZSTD_paramSwitch_e useRowMatchFinder,
         const size_t buffInSize,
         const size_t buffOutSize,
-        const U64 pledgedSrcSize)
+        const U64 pledgedSrcSize,
+        int useSequenceProducer,
+        size_t maxBlockSize)
 {
     size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize);
-    size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
-    U32    const divider = (cParams->minMatch==3) ? 3 : 4;
-    size_t const maxNbSeq = blockSize / divider;
+    size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize);
+    size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer);
     size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
                             + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef))
                             + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
@@ -1417,6 +1653,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
 
     size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;
 
+    size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
+    size_t const externalSeqSpace = useSequenceProducer
+        ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence))
+        : 0;
+
     size_t const neededSpace =
         cctxSpace +
         entropySpace +
@@ -1425,7 +1666,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
         ldmSeqSpace +
         matchStateSize +
         tokenSpace +
-        bufferSpace;
+        bufferSpace +
+        externalSeqSpace;
 
     DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);
     return neededSpace;
@@ -1443,7 +1685,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
      * be needed. However, we still allocate two 0-sized buffers, which can
      * take space under ASAN. */
     return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
-        &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
+        &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
 }
 
 size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
@@ -1493,7 +1735,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
     RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
     {   ZSTD_compressionParameters const cParams =
                 ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
-        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
+        size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog);
         size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered)
                 ? ((size_t)1 << cParams.windowLog) + blockSize
                 : 0;
@@ -1504,7 +1746,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
 
         return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
             &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
-            ZSTD_CONTENTSIZE_UNKNOWN);
+            ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
     }
 }
 
@@ -1637,6 +1879,19 @@ typedef enum {
     ZSTD_resetTarget_CCtx
 } ZSTD_resetTarget_e;
 
+/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */
+static U64 ZSTD_bitmix(U64 val, U64 len) {
+    val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24);
+    val *= 0x9FB21C651E98DF25ULL;
+    val ^= (val >> 35) + len ;
+    val *= 0x9FB21C651E98DF25ULL;
+    return val ^ (val >> 28);
+}
+
+/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */
+static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) {
+    ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4);
+}
 
 static size_t
 ZSTD_reset_matchState(ZSTD_matchState_t* ms,
@@ -1664,6 +1919,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
     }
 
     ms->hashLog3 = hashLog3;
+    ms->lazySkipping = 0;
 
     ZSTD_invalidateMatchState(ms);
 
@@ -1685,22 +1941,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
         ZSTD_cwksp_clean_tables(ws);
     }
 
-    /* opt parser space */
-    if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
-        DEBUGLOG(4, "reserving optimal parser space");
-        ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
-        ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned));
-        ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned));
-        ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned));
-        ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t));
-        ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
-    }
-
     if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
-        {   /* Row match finder needs an additional table of hashes ("tags") */
-            size_t const tagTableSize = hSize*sizeof(U16);
-            ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
-            if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
+        /* Row match finder needs an additional table of hashes ("tags") */
+        size_t const tagTableSize = hSize;
+        /* We want to generate a new salt in case we reset a Cctx, but we always want to use
+         * 0 when we reset a Cdict */
+        if(forWho == ZSTD_resetTarget_CCtx) {
+            ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize);
+            ZSTD_advanceHashSalt(ms);
+        } else {
+            /* When we are not salting we want to always memset the memory */
+            ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+            ZSTD_memset(ms->tagTable, 0, tagTableSize);
+            ms->hashSalt = 0;
         }
         {   /* Switch to 32-entry rows if searchLog is 5 (or more) */
             U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
@@ -1709,6 +1962,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
         }
     }
 
+    /* opt parser space */
+    if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
+        DEBUGLOG(4, "reserving optimal parser space");
+        ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
+        ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned));
+        ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned));
+        ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned));
+        ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t));
+        ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t));
+    }
+
     ms->cParams = *cParams;
 
     RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
@@ -1768,6 +2032,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
     assert(params->useRowMatchFinder != ZSTD_ps_auto);
     assert(params->useBlockSplitter != ZSTD_ps_auto);
     assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
+    assert(params->maxBlockSize != 0);
     if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
         /* Adjust long distance matching parameters */
         ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams);
@@ -1776,9 +2041,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
     }
 
     {   size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
-        size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
-        U32    const divider = (params->cParams.minMatch==3) ? 3 : 4;
-        size_t const maxNbSeq = blockSize / divider;
+        size_t const blockSize = MIN(params->maxBlockSize, windowSize);
+        size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params));
         size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
                 ? ZSTD_compressBound(blockSize) + 1
                 : 0;
@@ -1795,8 +2059,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
         size_t const neededSpace =
             ZSTD_estimateCCtxSize_usingCCtxParams_internal(
                 &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
-                buffInSize, buffOutSize, pledgedSrcSize);
-        int resizeWorkspace;
+                buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
 
         FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
 
@@ -1805,7 +2068,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
         {   /* Check if workspace is large enough, alloc a new one if needed */
             int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace;
             int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace);
-            resizeWorkspace = workspaceTooSmall || workspaceWasteful;
+            int resizeWorkspace = workspaceTooSmall || workspaceWasteful;
             DEBUGLOG(4, "Need %zu B workspace", neededSpace);
             DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize);
 
@@ -1838,6 +2101,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
 
         /* init params */
         zc->blockState.matchState.cParams = params->cParams;
+        zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable;
         zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
         zc->consumedSrcSize = 0;
         zc->producedCSize = 0;
@@ -1854,13 +2118,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
 
         ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
 
+        FORWARD_IF_ERROR(ZSTD_reset_matchState(
+                &zc->blockState.matchState,
+                ws,
+                &params->cParams,
+                params->useRowMatchFinder,
+                crp,
+                needsIndexReset,
+                ZSTD_resetTarget_CCtx), "");
+
+        zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
+
+        /* ldm hash table */
+        if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+            /* TODO: avoid memset? */
+            size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
+            zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
+            ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
+            zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
+            zc->maxNbLdmSequences = maxNbLdmSeq;
+
+            ZSTD_window_init(&zc->ldmState.window);
+            zc->ldmState.loadedDictEnd = 0;
+        }
+
+        /* reserve space for block-level external sequences */
+        if (ZSTD_hasExtSeqProd(params)) {
+            size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
+            zc->extSeqBufCapacity = maxNbExternalSeq;
+            zc->extSeqBuf =
+                (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence));
+        }
+
+        /* buffers */
+
         /* ZSTD_wildcopy() is used to copy into the literals buffer,
          * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
          */
         zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH);
         zc->seqStore.maxNbLit = blockSize;
 
-        /* buffers */
         zc->bufferedPolicy = zbuff;
         zc->inBuffSize = buffInSize;
         zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize);
@@ -1883,32 +2180,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
         zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
         zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
         zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
-        zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
-
-        FORWARD_IF_ERROR(ZSTD_reset_matchState(
-            &zc->blockState.matchState,
-            ws,
-            &params->cParams,
-            params->useRowMatchFinder,
-            crp,
-            needsIndexReset,
-            ZSTD_resetTarget_CCtx), "");
-
-        /* ldm hash table */
-        if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
-            /* TODO: avoid memset? */
-            size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
-            zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
-            ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
-            zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
-            zc->maxNbLdmSequences = maxNbLdmSeq;
-
-            ZSTD_window_init(&zc->ldmState.window);
-            zc->ldmState.loadedDictEnd = 0;
-        }
 
         DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
-        assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace));
+        assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace));
 
         zc->initialized = 1;
 
@@ -1980,7 +2254,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
         }
 
         params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize,
-                                                     cdict->dictContentSize, ZSTD_cpm_attachDict);
+                                                     cdict->dictContentSize, ZSTD_cpm_attachDict,
+                                                     params.useRowMatchFinder);
         params.cParams.windowLog = windowLog;
         params.useRowMatchFinder = cdict->useRowMatchFinder;    /* cdict overrides */
         FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
@@ -2019,6 +2294,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
     return 0;
 }
 
+static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize,
+                                        ZSTD_compressionParameters const* cParams) {
+    if (ZSTD_CDictIndicesAreTagged(cParams)){
+        /* Remove tags from the CDict table if they are present.
+         * See docs on "short cache" in zstd_compress_internal.h for context. */
+        size_t i;
+        for (i = 0; i < tableSize; i++) {
+            U32 const taggedIndex = src[i];
+            U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS;
+            dst[i] = index;
+        }
+    } else {
+        ZSTD_memcpy(dst, src, tableSize * sizeof(U32));
+    }
+}
+
 static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
                             const ZSTD_CDict* cdict,
                             ZSTD_CCtx_params params,
@@ -2054,21 +2345,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
                                                             : 0;
         size_t const hSize =  (size_t)1 << cdict_cParams->hashLog;
 
-        ZSTD_memcpy(cctx->blockState.matchState.hashTable,
-               cdict->matchState.hashTable,
-               hSize * sizeof(U32));
+        ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable,
+                                cdict->matchState.hashTable,
+                                hSize, cdict_cParams);
+
         /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */
         if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) {
-            ZSTD_memcpy(cctx->blockState.matchState.chainTable,
-               cdict->matchState.chainTable,
-               chainSize * sizeof(U32));
+            ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable,
+                                    cdict->matchState.chainTable,
+                                    chainSize, cdict_cParams);
         }
         /* copy tag table */
         if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) {
-            size_t const tagTableSize = hSize*sizeof(U16);
+            size_t const tagTableSize = hSize;
             ZSTD_memcpy(cctx->blockState.matchState.tagTable,
-                cdict->matchState.tagTable,
-                tagTableSize);
+                        cdict->matchState.tagTable,
+                        tagTableSize);
+            cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt;
         }
     }
 
@@ -2147,6 +2440,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
         params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter;
         params.ldmParams = srcCCtx->appliedParams.ldmParams;
         params.fParams = fParams;
+        params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize;
         ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize,
                                 /* loadedDictSize */ 0,
                                 ZSTDcrp_leaveDirty, zbuff);
@@ -2294,7 +2588,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par
 
 /* See doc/zstd_compression_format.md for detailed format description */
 
-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
 {
     const seqDef* const sequences = seqStorePtr->sequencesStart;
     BYTE* const llCodeTable = seqStorePtr->llCode;
@@ -2302,18 +2596,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
     BYTE* const mlCodeTable = seqStorePtr->mlCode;
     U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
     U32 u;
+    int longOffsets = 0;
     assert(nbSeq <= seqStorePtr->maxNbSeq);
     for (u=0; u<nbSeq; u++) {
         U32 const llv = sequences[u].litLength;
+        U32 const ofCode = ZSTD_highbit32(sequences[u].offBase);
         U32 const mlv = sequences[u].mlBase;
         llCodeTable[u] = (BYTE)ZSTD_LLcode(llv);
-        ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offBase);
+        ofCodeTable[u] = (BYTE)ofCode;
         mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv);
+        assert(!(MEM_64bits() && ofCode >= STREAM_ACCUMULATOR_MIN));
+        if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN)
+            longOffsets = 1;
     }
     if (seqStorePtr->longLengthType==ZSTD_llt_literalLength)
         llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
     if (seqStorePtr->longLengthType==ZSTD_llt_matchLength)
         mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
+    return longOffsets;
 }
 
 /* ZSTD_useTargetCBlockSize():
@@ -2347,6 +2647,7 @@ typedef struct {
     U32 MLtype;
     size_t size;
     size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */
+    int longOffsets;
 } ZSTD_symbolEncodingTypeStats_t;
 
 /* ZSTD_buildSequencesStatistics():
@@ -2357,11 +2658,13 @@ typedef struct {
  * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32)
  */
 static ZSTD_symbolEncodingTypeStats_t
-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
-                        const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy,
-                              BYTE* dst, const BYTE* const dstEnd,
-                              ZSTD_strategy strategy, unsigned* countWorkspace,
-                              void* entropyWorkspace, size_t entropyWkspSize) {
+ZSTD_buildSequencesStatistics(
+                const seqStore_t* seqStorePtr, size_t nbSeq,
+                const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy,
+                      BYTE* dst, const BYTE* const dstEnd,
+                      ZSTD_strategy strategy, unsigned* countWorkspace,
+                      void* entropyWorkspace, size_t entropyWkspSize)
+{
     BYTE* const ostart = dst;
     const BYTE* const oend = dstEnd;
     BYTE* op = ostart;
@@ -2375,7 +2678,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
 
     stats.lastCountSize = 0;
     /* convert length/distances into codes */
-    ZSTD_seqToCodes(seqStorePtr);
+    stats.longOffsets = ZSTD_seqToCodes(seqStorePtr);
     assert(op <= oend);
     assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */
     /* build CTable for Literal Lengths */
@@ -2480,22 +2783,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq,
  */
 #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20
 MEM_STATIC size_t
-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
-                          const ZSTD_entropyCTables_t* prevEntropy,
-                                ZSTD_entropyCTables_t* nextEntropy,
-                          const ZSTD_CCtx_params* cctxParams,
-                                void* dst, size_t dstCapacity,
-                                void* entropyWorkspace, size_t entropyWkspSize,
-                          const int bmi2)
+ZSTD_entropyCompressSeqStore_internal(
+                        const seqStore_t* seqStorePtr,
+                        const ZSTD_entropyCTables_t* prevEntropy,
+                              ZSTD_entropyCTables_t* nextEntropy,
+                        const ZSTD_CCtx_params* cctxParams,
+                              void* dst, size_t dstCapacity,
+                              void* entropyWorkspace, size_t entropyWkspSize,
+                        const int bmi2)
 {
-    const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
     ZSTD_strategy const strategy = cctxParams->cParams.strategy;
     unsigned* count = (unsigned*)entropyWorkspace;
     FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable;
     FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable;
     FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable;
     const seqDef* const sequences = seqStorePtr->sequencesStart;
-    const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+    const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
     const BYTE* const ofCodeTable = seqStorePtr->ofCode;
     const BYTE* const llCodeTable = seqStorePtr->llCode;
     const BYTE* const mlCodeTable = seqStorePtr->mlCode;
@@ -2503,29 +2806,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
     BYTE* const oend = ostart + dstCapacity;
     BYTE* op = ostart;
     size_t lastCountSize;
+    int longOffsets = 0;
 
     entropyWorkspace = count + (MaxSeq + 1);
     entropyWkspSize -= (MaxSeq + 1) * sizeof(*count);
 
-    DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq);
+    DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity);
     ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
     assert(entropyWkspSize >= HUF_WORKSPACE_SIZE);
 
     /* Compress literals */
     {   const BYTE* const literals = seqStorePtr->litStart;
-        size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart;
-        size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart;
+        size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+        size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
         /* Base suspicion of uncompressibility on ratio of literals to sequences */
         unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO);
         size_t const litSize = (size_t)(seqStorePtr->lit - literals);
+
         size_t const cSize = ZSTD_compressLiterals(
-                                    &prevEntropy->huf, &nextEntropy->huf,
-                                    cctxParams->cParams.strategy,
-                                    ZSTD_literalsCompressionIsDisabled(cctxParams),
                                     op, dstCapacity,
                                     literals, litSize,
                                     entropyWorkspace, entropyWkspSize,
-                                    bmi2, suspectUncompressible);
+                                    &prevEntropy->huf, &nextEntropy->huf,
+                                    cctxParams->cParams.strategy,
+                                    ZSTD_literalsCompressionIsDisabled(cctxParams),
+                                    suspectUncompressible, bmi2);
         FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
         assert(cSize <= dstCapacity);
         op += cSize;
@@ -2551,11 +2856,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
         ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
         return (size_t)(op - ostart);
     }
-    {
-        ZSTD_symbolEncodingTypeStats_t stats;
-        BYTE* seqHead = op++;
+    {   BYTE* const seqHead = op++;
         /* build stats for sequences */
-        stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
+        const ZSTD_symbolEncodingTypeStats_t stats =
+                ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq,
                                              &prevEntropy->fse, &nextEntropy->fse,
                                               op, oend,
                                               strategy, count,
@@ -2564,6 +2868,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
         *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2));
         lastCountSize = stats.lastCountSize;
         op += stats.size;
+        longOffsets = stats.longOffsets;
     }
 
     {   size_t const bitstreamSize = ZSTD_encodeSequences(
@@ -2598,14 +2903,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr,
 }
 
 MEM_STATIC size_t
-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
-                       const ZSTD_entropyCTables_t* prevEntropy,
-                             ZSTD_entropyCTables_t* nextEntropy,
-                       const ZSTD_CCtx_params* cctxParams,
-                             void* dst, size_t dstCapacity,
-                             size_t srcSize,
-                             void* entropyWorkspace, size_t entropyWkspSize,
-                             int bmi2)
+ZSTD_entropyCompressSeqStore(
+                    const seqStore_t* seqStorePtr,
+                    const ZSTD_entropyCTables_t* prevEntropy,
+                          ZSTD_entropyCTables_t* nextEntropy,
+                    const ZSTD_CCtx_params* cctxParams,
+                          void* dst, size_t dstCapacity,
+                          size_t srcSize,
+                          void* entropyWorkspace, size_t entropyWkspSize,
+                          int bmi2)
 {
     size_t const cSize = ZSTD_entropyCompressSeqStore_internal(
                             seqStorePtr, prevEntropy, nextEntropy, cctxParams,
@@ -2615,15 +2921,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr,
     /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block.
      * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block.
      */
-    if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity))
+    if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) {
+        DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity);
         return 0;  /* block not compressed */
+    }
     FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed");
 
     /* Check compressibility */
     {   size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
         if (cSize >= maxCSize) return 0;  /* block not compressed */
     }
-    DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);
+    DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize);
+    /* libzstd decoder before  > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly.
+     * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above.
+     */
+    assert(cSize < ZSTD_BLOCKSIZE_MAX);
     return cSize;
 }
 
@@ -2635,40 +2947,43 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS
     static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = {
         { ZSTD_compressBlock_fast  /* default for 0 */,
           ZSTD_compressBlock_fast,
-          ZSTD_compressBlock_doubleFast,
-          ZSTD_compressBlock_greedy,
-          ZSTD_compressBlock_lazy,
-          ZSTD_compressBlock_lazy2,
-          ZSTD_compressBlock_btlazy2,
-          ZSTD_compressBlock_btopt,
-          ZSTD_compressBlock_btultra,
-          ZSTD_compressBlock_btultra2 },
+          ZSTD_COMPRESSBLOCK_DOUBLEFAST,
+          ZSTD_COMPRESSBLOCK_GREEDY,
+          ZSTD_COMPRESSBLOCK_LAZY,
+          ZSTD_COMPRESSBLOCK_LAZY2,
+          ZSTD_COMPRESSBLOCK_BTLAZY2,
+          ZSTD_COMPRESSBLOCK_BTOPT,
+          ZSTD_COMPRESSBLOCK_BTULTRA,
+          ZSTD_COMPRESSBLOCK_BTULTRA2
+        },
         { ZSTD_compressBlock_fast_extDict  /* default for 0 */,
           ZSTD_compressBlock_fast_extDict,
-          ZSTD_compressBlock_doubleFast_extDict,
-          ZSTD_compressBlock_greedy_extDict,
-          ZSTD_compressBlock_lazy_extDict,
-          ZSTD_compressBlock_lazy2_extDict,
-          ZSTD_compressBlock_btlazy2_extDict,
-          ZSTD_compressBlock_btopt_extDict,
-          ZSTD_compressBlock_btultra_extDict,
-          ZSTD_compressBlock_btultra_extDict },
+          ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT,
+          ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT,
+          ZSTD_COMPRESSBLOCK_LAZY_EXTDICT,
+          ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT,
+          ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT,
+          ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT,
+          ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT,
+          ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT
+        },
         { ZSTD_compressBlock_fast_dictMatchState  /* default for 0 */,
           ZSTD_compressBlock_fast_dictMatchState,
-          ZSTD_compressBlock_doubleFast_dictMatchState,
-          ZSTD_compressBlock_greedy_dictMatchState,
-          ZSTD_compressBlock_lazy_dictMatchState,
-          ZSTD_compressBlock_lazy2_dictMatchState,
-          ZSTD_compressBlock_btlazy2_dictMatchState,
-          ZSTD_compressBlock_btopt_dictMatchState,
-          ZSTD_compressBlock_btultra_dictMatchState,
-          ZSTD_compressBlock_btultra_dictMatchState },
+          ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE,
+          ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE
+        },
         { NULL  /* default for 0 */,
           NULL,
           NULL,
-          ZSTD_compressBlock_greedy_dedicatedDictSearch,
-          ZSTD_compressBlock_lazy_dedicatedDictSearch,
-          ZSTD_compressBlock_lazy2_dedicatedDictSearch,
+          ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH,
+          ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH,
+          ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH,
           NULL,
           NULL,
           NULL,
@@ -2681,18 +2996,26 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS
     DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder);
     if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) {
         static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = {
-            { ZSTD_compressBlock_greedy_row,
-            ZSTD_compressBlock_lazy_row,
-            ZSTD_compressBlock_lazy2_row },
-            { ZSTD_compressBlock_greedy_extDict_row,
-            ZSTD_compressBlock_lazy_extDict_row,
-            ZSTD_compressBlock_lazy2_extDict_row },
-            { ZSTD_compressBlock_greedy_dictMatchState_row,
-            ZSTD_compressBlock_lazy_dictMatchState_row,
-            ZSTD_compressBlock_lazy2_dictMatchState_row },
-            { ZSTD_compressBlock_greedy_dedicatedDictSearch_row,
-            ZSTD_compressBlock_lazy_dedicatedDictSearch_row,
-            ZSTD_compressBlock_lazy2_dedicatedDictSearch_row }
+            {
+                ZSTD_COMPRESSBLOCK_GREEDY_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY2_ROW
+            },
+            {
+                ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW
+            },
+            {
+                ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW
+            },
+            {
+                ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW,
+                ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW
+            }
         };
         DEBUGLOG(4, "Selecting a row-based matchfinder");
         assert(useRowMatchFinder != ZSTD_ps_auto);
@@ -2718,6 +3041,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr)
     ssPtr->longLengthType = ZSTD_llt_none;
 }
 
+/* ZSTD_postProcessSequenceProducerResult() :
+ * Validates and post-processes sequences obtained through the external matchfinder API:
+ *   - Checks whether nbExternalSeqs represents an error condition.
+ *   - Appends a block delimiter to outSeqs if one is not already present.
+ *     See zstd.h for context regarding block delimiters.
+ * Returns the number of sequences after post-processing, or an error code. */
+static size_t ZSTD_postProcessSequenceProducerResult(
+    ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize
+) {
+    RETURN_ERROR_IF(
+        nbExternalSeqs > outSeqsCapacity,
+        sequenceProducer_failed,
+        "External sequence producer returned error code %lu",
+        (unsigned long)nbExternalSeqs
+    );
+
+    RETURN_ERROR_IF(
+        nbExternalSeqs == 0 && srcSize > 0,
+        sequenceProducer_failed,
+        "Got zero sequences from external sequence producer for a non-empty src buffer!"
+    );
+
+    if (srcSize == 0) {
+        ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence));
+        return 1;
+    }
+
+    {
+        ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1];
+
+        /* We can return early if lastSeq is already a block delimiter. */
+        if (lastSeq.offset == 0 && lastSeq.matchLength == 0) {
+            return nbExternalSeqs;
+        }
+
+        /* This error condition is only possible if the external matchfinder
+         * produced an invalid parse, by definition of ZSTD_sequenceBound(). */
+        RETURN_ERROR_IF(
+            nbExternalSeqs == outSeqsCapacity,
+            sequenceProducer_failed,
+            "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!"
+        );
+
+        /* lastSeq is not a block delimiter, so we need to append one. */
+        ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence));
+        return nbExternalSeqs + 1;
+    }
+}
+
+/* ZSTD_fastSequenceLengthSum() :
+ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*.
+ * Similar to another function in zstd_compress.c (determine_blockSize),
+ * except it doesn't check for a block delimiter to end summation.
+ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P).
+ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */
+static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) {
+    size_t matchLenSum, litLenSum, i;
+    matchLenSum = 0;
+    litLenSum = 0;
+    for (i = 0; i < seqBufSize; i++) {
+        litLenSum += seqBuf[i].litLength;
+        matchLenSum += seqBuf[i].matchLength;
+    }
+    return litLenSum + matchLenSum;
+}
+
 typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e;
 
 static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
@@ -2727,7 +3116,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
     assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
     /* Assert that we have correctly flushed the ctx params into the ms's copy */
     ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams);
-    if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
+    /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+     * additional 1. We need to revisit and change this logic to be more consistent */
+    if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) {
         if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) {
             ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize);
         } else {
@@ -2763,6 +3154,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
         }
         if (zc->externSeqStore.pos < zc->externSeqStore.size) {
             assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable);
+
+            /* External matchfinder + LDM is technically possible, just not implemented yet.
+             * We need to revisit soon and implement it. */
+            RETURN_ERROR_IF(
+                ZSTD_hasExtSeqProd(&zc->appliedParams),
+                parameter_combination_unsupported,
+                "Long-distance matching with external sequence producer enabled is not currently supported."
+            );
+
             /* Updates ldmSeqStore.pos */
             lastLLSize =
                 ZSTD_ldm_blockCompress(&zc->externSeqStore,
@@ -2774,6 +3174,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
         } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) {
             rawSeqStore_t ldmSeqStore = kNullRawSeqStore;
 
+            /* External matchfinder + LDM is technically possible, just not implemented yet.
+             * We need to revisit soon and implement it. */
+            RETURN_ERROR_IF(
+                ZSTD_hasExtSeqProd(&zc->appliedParams),
+                parameter_combination_unsupported,
+                "Long-distance matching with external sequence producer enabled is not currently supported."
+            );
+
             ldmSeqStore.seq = zc->ldmSequences;
             ldmSeqStore.capacity = zc->maxNbLdmSequences;
             /* Updates ldmSeqStore.size */
@@ -2788,10 +3196,74 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
                                        zc->appliedParams.useRowMatchFinder,
                                        src, srcSize);
             assert(ldmSeqStore.pos == ldmSeqStore.size);
-        } else {   /* not long range mode */
-            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy,
-                                                                                    zc->appliedParams.useRowMatchFinder,
-                                                                                    dictMode);
+        } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) {
+            assert(
+                zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize)
+            );
+            assert(zc->appliedParams.extSeqProdFunc != NULL);
+
+            {   U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog;
+
+                size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)(
+                    zc->appliedParams.extSeqProdState,
+                    zc->extSeqBuf,
+                    zc->extSeqBufCapacity,
+                    src, srcSize,
+                    NULL, 0,  /* dict and dictSize, currently not supported */
+                    zc->appliedParams.compressionLevel,
+                    windowSize
+                );
+
+                size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult(
+                    zc->extSeqBuf,
+                    nbExternalSeqs,
+                    zc->extSeqBufCapacity,
+                    srcSize
+                );
+
+                /* Return early if there is no error, since we don't need to worry about last literals */
+                if (!ZSTD_isError(nbPostProcessedSeqs)) {
+                    ZSTD_sequencePosition seqPos = {0,0,0};
+                    size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs);
+                    RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!");
+                    FORWARD_IF_ERROR(
+                        ZSTD_copySequencesToSeqStoreExplicitBlockDelim(
+                            zc, &seqPos,
+                            zc->extSeqBuf, nbPostProcessedSeqs,
+                            src, srcSize,
+                            zc->appliedParams.searchForExternalRepcodes
+                        ),
+                        "Failed to copy external sequences to seqStore!"
+                    );
+                    ms->ldmSeqStore = NULL;
+                    DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs);
+                    return ZSTDbss_compress;
+                }
+
+                /* Propagate the error if fallback is disabled */
+                if (!zc->appliedParams.enableMatchFinderFallback) {
+                    return nbPostProcessedSeqs;
+                }
+
+                /* Fallback to software matchfinder */
+                {   ZSTD_blockCompressor const blockCompressor =
+                        ZSTD_selectBlockCompressor(
+                            zc->appliedParams.cParams.strategy,
+                            zc->appliedParams.useRowMatchFinder,
+                            dictMode);
+                    ms->ldmSeqStore = NULL;
+                    DEBUGLOG(
+                        5,
+                        "External sequence producer returned error code %lu. Falling back to internal parser.",
+                        (unsigned long)nbExternalSeqs
+                    );
+                    lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
+            }   }
+        } else {   /* not long range mode and no external matchfinder */
+            ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(
+                    zc->appliedParams.cParams.strategy,
+                    zc->appliedParams.useRowMatchFinder,
+                    dictMode);
             ms->ldmSeqStore = NULL;
             lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
         }
@@ -2801,29 +3273,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
     return ZSTDbss_compress;
 }
 
-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
+static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM])
 {
-    const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
-    const seqDef* seqStoreSeqs = seqStore->sequencesStart;
-    size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs;
-    size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart);
-    size_t literalsRead = 0;
-    size_t lastLLSize;
+    const seqDef* inSeqs = seqStore->sequencesStart;
+    const size_t nbInSequences = seqStore->sequences - inSeqs;
+    const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart);
 
-    ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
+    ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex;
+    const size_t nbOutSequences = nbInSequences + 1;
+    size_t nbOutLiterals = 0;
+    repcodes_t repcodes;
     size_t i;
-    repcodes_t updatedRepcodes;
 
-    assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
-    /* Ensure we have enough space for last literals "sequence" */
-    assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1);
-    ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
-    for (i = 0; i < seqStoreSeqSize; ++i) {
-        U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM;
-        outSeqs[i].litLength = seqStoreSeqs[i].litLength;
-        outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH;
+    /* Bounds check that we have enough space for every input sequence
+     * and the block delimiter
+     */
+    assert(seqCollector->seqIndex <= seqCollector->maxSequences);
+    RETURN_ERROR_IF(
+        nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex),
+        dstSize_tooSmall,
+        "Not enough space to copy sequences");
+
+    ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes));
+    for (i = 0; i < nbInSequences; ++i) {
+        U32 rawOffset;
+        outSeqs[i].litLength = inSeqs[i].litLength;
+        outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH;
         outSeqs[i].rep = 0;
 
+        /* Handle the possible single length >= 64K
+         * There can only be one because we add MINMATCH to every match length,
+         * and blocks are at most 128K.
+         */
         if (i == seqStore->longLengthPos) {
             if (seqStore->longLengthType == ZSTD_llt_literalLength) {
                 outSeqs[i].litLength += 0x10000;
@@ -2832,37 +3313,55 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
             }
         }
 
-        if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) {
-            /* Derive the correct offset corresponding to a repcode */
-            outSeqs[i].rep = seqStoreSeqs[i].offBase;
+        /* Determine the raw offset given the offBase, which may be a repcode. */
+        if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) {
+            const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase);
+            assert(repcode > 0);
+            outSeqs[i].rep = repcode;
             if (outSeqs[i].litLength != 0) {
-                rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1];
+                rawOffset = repcodes.rep[repcode - 1];
             } else {
-                if (outSeqs[i].rep == 3) {
-                    rawOffset = updatedRepcodes.rep[0] - 1;
+                if (repcode == 3) {
+                    assert(repcodes.rep[0] > 1);
+                    rawOffset = repcodes.rep[0] - 1;
                 } else {
-                    rawOffset = updatedRepcodes.rep[outSeqs[i].rep];
+                    rawOffset = repcodes.rep[repcode];
                 }
             }
+        } else {
+            rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase);
         }
         outSeqs[i].offset = rawOffset;
-        /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode
-           so we provide seqStoreSeqs[i].offset - 1 */
-        ZSTD_updateRep(updatedRepcodes.rep,
-                       seqStoreSeqs[i].offBase - 1,
-                       seqStoreSeqs[i].litLength == 0);
-        literalsRead += outSeqs[i].litLength;
+
+        /* Update repcode history for the sequence */
+        ZSTD_updateRep(repcodes.rep,
+                       inSeqs[i].offBase,
+                       inSeqs[i].litLength == 0);
+
+        nbOutLiterals += outSeqs[i].litLength;
     }
     /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0.
      * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker
      * for the block boundary, according to the API.
      */
-    assert(seqStoreLiteralsSize >= literalsRead);
-    lastLLSize = seqStoreLiteralsSize - literalsRead;
-    outSeqs[i].litLength = (U32)lastLLSize;
-    outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0;
-    seqStoreSeqSize++;
-    zc->seqCollector.seqIndex += seqStoreSeqSize;
+    assert(nbInLiterals >= nbOutLiterals);
+    {
+        const size_t lastLLSize = nbInLiterals - nbOutLiterals;
+        outSeqs[nbInSequences].litLength = (U32)lastLLSize;
+        outSeqs[nbInSequences].matchLength = 0;
+        outSeqs[nbInSequences].offset = 0;
+        assert(nbOutSequences == nbInSequences + 1);
+    }
+    seqCollector->seqIndex += nbOutSequences;
+    assert(seqCollector->seqIndex <= seqCollector->maxSequences);
+
+    return 0;
+}
+
+size_t ZSTD_sequenceBound(size_t srcSize) {
+    const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1;
+    const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1;
+    return maxNbSeq + maxNbDelims;
 }
 
 size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
@@ -2871,6 +3370,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
     const size_t dstCapacity = ZSTD_compressBound(srcSize);
     void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem);
     SeqCollector seqCollector;
+    {
+        int targetCBlockSize;
+        FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), "");
+        RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0");
+    }
+    {
+        int nbWorkers;
+        FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), "");
+        RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0");
+    }
 
     RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!");
 
@@ -2880,8 +3389,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
     seqCollector.maxSequences = outSeqsSize;
     zc->seqCollector = seqCollector;
 
-    ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
-    ZSTD_customFree(dst, ZSTD_defaultCMem);
+    {
+        const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
+        ZSTD_customFree(dst, ZSTD_defaultCMem);
+        FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed");
+    }
+    assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize));
     return zc->seqCollector.seqIndex;
 }
 
@@ -2910,19 +3423,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) {
     const size_t unrollMask = unrollSize - 1;
     const size_t prefixLength = length & unrollMask;
     size_t i;
-    size_t u;
     if (length == 1) return 1;
     /* Check if prefix is RLE first before using unrolled loop */
     if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) {
         return 0;
     }
     for (i = prefixLength; i != length; i += unrollSize) {
+        size_t u;
         for (u = 0; u < unrollSize; u += sizeof(size_t)) {
             if (MEM_readST(ip + i + u) != valueST) {
                 return 0;
-            }
-        }
-    }
+    }   }   }
     return 1;
 }
 
@@ -2938,7 +3449,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore)
     return nbSeqs < 4 && nbLits < 10;
 }
 
-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs)
+static void
+ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs)
 {
     ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock;
     bs->prevCBlock = bs->nextCBlock;
@@ -2946,7 +3458,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c
 }
 
 /* Writes the block header */
-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) {
+static void
+writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock)
+{
     U32 const cBlockHeader = cSize == 1 ?
                         lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) :
                         lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
@@ -2959,13 +3473,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB
  *  Stores literals block type (raw, rle, compressed, repeat) and
  *  huffman description table to hufMetadata.
  *  Requires ENTROPY_WORKSPACE_SIZE workspace
- *  @return : size of huffman description table or error code */
-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize,
-                                            const ZSTD_hufCTables_t* prevHuf,
-                                                  ZSTD_hufCTables_t* nextHuf,
-                                                  ZSTD_hufCTablesMetadata_t* hufMetadata,
-                                                  const int literalsCompressionIsDisabled,
-                                                  void* workspace, size_t wkspSize)
+ * @return : size of huffman description table, or an error code
+ */
+static size_t
+ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize,
+                               const ZSTD_hufCTables_t* prevHuf,
+                                     ZSTD_hufCTables_t* nextHuf,
+                                     ZSTD_hufCTablesMetadata_t* hufMetadata,
+                               const int literalsCompressionIsDisabled,
+                                     void* workspace, size_t wkspSize,
+                                     int hufFlags)
 {
     BYTE* const wkspStart = (BYTE*)workspace;
     BYTE* const wkspEnd = wkspStart + wkspSize;
@@ -2973,9 +3490,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
     unsigned* const countWksp = (unsigned*)workspace;
     const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned);
     BYTE* const nodeWksp = countWkspStart + countWkspSize;
-    const size_t nodeWkspSize = wkspEnd-nodeWksp;
+    const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp);
     unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
-    unsigned huffLog = HUF_TABLELOG_DEFAULT;
+    unsigned huffLog = LitHufLog;
     HUF_repeat repeat = prevHuf->repeatMode;
     DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize);
 
@@ -2990,73 +3507,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
 
     /* small ? don't even attempt compression (speed opt) */
 #ifndef COMPRESS_LITERALS_SIZE_MIN
-#define COMPRESS_LITERALS_SIZE_MIN 63
+# define COMPRESS_LITERALS_SIZE_MIN 63  /* heuristic */
 #endif
     {   size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
         if (srcSize <= minLitSize) {
             DEBUGLOG(5, "set_basic - too small");
             hufMetadata->hType = set_basic;
             return 0;
-        }
-    }
+    }   }
 
     /* Scan input and build symbol stats */
-    {   size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize);
+    {   size_t const largest =
+            HIST_count_wksp (countWksp, &maxSymbolValue,
+                            (const BYTE*)src, srcSize,
+                            workspace, wkspSize);
         FORWARD_IF_ERROR(largest, "HIST_count_wksp failed");
         if (largest == srcSize) {
+            /* only one literal symbol */
             DEBUGLOG(5, "set_rle");
             hufMetadata->hType = set_rle;
             return 0;
         }
         if (largest <= (srcSize >> 7)+4) {
+            /* heuristic: likely not compressible */
             DEBUGLOG(5, "set_basic - no gain");
             hufMetadata->hType = set_basic;
             return 0;
-        }
-    }
+    }   }
 
     /* Validate the previous Huffman table */
-    if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
+    if (repeat == HUF_repeat_check
+      && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
         repeat = HUF_repeat_none;
     }
 
     /* Build Huffman Tree */
     ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable));
-    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+    huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags);
+    assert(huffLog <= LitHufLog);
     {   size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp,
                                                     maxSymbolValue, huffLog,
                                                     nodeWksp, nodeWkspSize);
         FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp");
         huffLog = (U32)maxBits;
-        {   /* Build and write the CTable */
-            size_t const newCSize = HUF_estimateCompressedSize(
-                    (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
-            size_t const hSize = HUF_writeCTable_wksp(
-                    hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
-                    (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog,
-                    nodeWksp, nodeWkspSize);
-            /* Check against repeating the previous CTable */
-            if (repeat != HUF_repeat_none) {
-                size_t const oldCSize = HUF_estimateCompressedSize(
-                        (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
-                if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
-                    DEBUGLOG(5, "set_repeat - smaller");
-                    ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-                    hufMetadata->hType = set_repeat;
-                    return 0;
-                }
-            }
-            if (newCSize + hSize >= srcSize) {
-                DEBUGLOG(5, "set_basic - no gains");
+    }
+    {   /* Build and write the CTable */
+        size_t const newCSize = HUF_estimateCompressedSize(
+                (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
+        size_t const hSize = HUF_writeCTable_wksp(
+                hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
+                (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog,
+                nodeWksp, nodeWkspSize);
+        /* Check against repeating the previous CTable */
+        if (repeat != HUF_repeat_none) {
+            size_t const oldCSize = HUF_estimateCompressedSize(
+                    (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
+            if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
+                DEBUGLOG(5, "set_repeat - smaller");
                 ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-                hufMetadata->hType = set_basic;
+                hufMetadata->hType = set_repeat;
                 return 0;
-            }
-            DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
-            hufMetadata->hType = set_compressed;
-            nextHuf->repeatMode = HUF_repeat_check;
-            return hSize;
+        }   }
+        if (newCSize + hSize >= srcSize) {
+            DEBUGLOG(5, "set_basic - no gains");
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+            hufMetadata->hType = set_basic;
+            return 0;
         }
+        DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
+        hufMetadata->hType = set_compressed;
+        nextHuf->repeatMode = HUF_repeat_check;
+        return hSize;
     }
 }
 
@@ -3066,8 +3587,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi
  * and updates nextEntropy to the appropriate repeatMode.
  */
 static ZSTD_symbolEncodingTypeStats_t
-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) {
-    ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0};
+ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy)
+{
+    ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0};
     nextEntropy->litlength_repeatMode = FSE_repeat_none;
     nextEntropy->offcode_repeatMode = FSE_repeat_none;
     nextEntropy->matchlength_repeatMode = FSE_repeat_none;
@@ -3078,16 +3600,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) {
  *  Builds entropy for the sequences.
  *  Stores symbol compression modes and fse table to fseMetadata.
  *  Requires ENTROPY_WORKSPACE_SIZE wksp.
- *  @return : size of fse tables or error code */
-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr,
-                                              const ZSTD_fseCTables_t* prevEntropy,
-                                                    ZSTD_fseCTables_t* nextEntropy,
-                                              const ZSTD_CCtx_params* cctxParams,
-                                                    ZSTD_fseCTablesMetadata_t* fseMetadata,
-                                                    void* workspace, size_t wkspSize)
+ * @return : size of fse tables or error code */
+static size_t
+ZSTD_buildBlockEntropyStats_sequences(
+                const seqStore_t* seqStorePtr,
+                const ZSTD_fseCTables_t* prevEntropy,
+                      ZSTD_fseCTables_t* nextEntropy,
+                const ZSTD_CCtx_params* cctxParams,
+                      ZSTD_fseCTablesMetadata_t* fseMetadata,
+                      void* workspace, size_t wkspSize)
 {
     ZSTD_strategy const strategy = cctxParams->cParams.strategy;
-    size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+    size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
     BYTE* const ostart = fseMetadata->fseTablesBuffer;
     BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer);
     BYTE* op = ostart;
@@ -3114,23 +3638,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr,
 /* ZSTD_buildBlockEntropyStats() :
  *  Builds entropy for the block.
  *  Requires workspace size ENTROPY_WORKSPACE_SIZE
- *
- *  @return : 0 on success or error code
+ * @return : 0 on success, or an error code
+ *  Note : also employed in superblock
  */
-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
-                             const ZSTD_entropyCTables_t* prevEntropy,
-                                   ZSTD_entropyCTables_t* nextEntropy,
-                             const ZSTD_CCtx_params* cctxParams,
-                                   ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                                   void* workspace, size_t wkspSize)
-{
-    size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart;
+size_t ZSTD_buildBlockEntropyStats(
+            const seqStore_t* seqStorePtr,
+            const ZSTD_entropyCTables_t* prevEntropy,
+                  ZSTD_entropyCTables_t* nextEntropy,
+            const ZSTD_CCtx_params* cctxParams,
+                  ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                  void* workspace, size_t wkspSize)
+{
+    size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart);
+    int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD);
+    int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0;
+
     entropyMetadata->hufMetadata.hufDesSize =
         ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize,
                                             &prevEntropy->huf, &nextEntropy->huf,
                                             &entropyMetadata->hufMetadata,
                                             ZSTD_literalsCompressionIsDisabled(cctxParams),
-                                            workspace, wkspSize);
+                                            workspace, wkspSize, hufFlags);
+
     FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed");
     entropyMetadata->fseMetadata.fseTablesSize =
         ZSTD_buildBlockEntropyStats_sequences(seqStorePtr,
@@ -3143,11 +3672,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
 }
 
 /* Returns the size estimate for the literals section (header + content) of a block */
-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize,
-                                                const ZSTD_hufCTables_t* huf,
-                                                const ZSTD_hufCTablesMetadata_t* hufMetadata,
-                                                void* workspace, size_t wkspSize,
-                                                int writeEntropy)
+static size_t
+ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize,
+                               const ZSTD_hufCTables_t* huf,
+                               const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                               void* workspace, size_t wkspSize,
+                               int writeEntropy)
 {
     unsigned* const countWksp = (unsigned*)workspace;
     unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
@@ -3169,12 +3699,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz
 }
 
 /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */
-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
-                        const BYTE* codeTable, size_t nbSeq, unsigned maxCode,
-                        const FSE_CTable* fseCTable,
-                        const U8* additionalBits,
-                        short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
-                        void* workspace, size_t wkspSize)
+static size_t
+ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
+                    const BYTE* codeTable, size_t nbSeq, unsigned maxCode,
+                    const FSE_CTable* fseCTable,
+                    const U8* additionalBits,
+                    short const* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+                    void* workspace, size_t wkspSize)
 {
     unsigned* const countWksp = (unsigned*)workspace;
     const BYTE* ctp = codeTable;
@@ -3206,99 +3737,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type,
 }
 
 /* Returns the size estimate for the sequences section (header + content) of a block */
-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable,
-                                                  const BYTE* llCodeTable,
-                                                  const BYTE* mlCodeTable,
-                                                  size_t nbSeq,
-                                                  const ZSTD_fseCTables_t* fseTables,
-                                                  const ZSTD_fseCTablesMetadata_t* fseMetadata,
-                                                  void* workspace, size_t wkspSize,
-                                                  int writeEntropy)
+static size_t
+ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable,
+                                 const BYTE* llCodeTable,
+                                 const BYTE* mlCodeTable,
+                                 size_t nbSeq,
+                                 const ZSTD_fseCTables_t* fseTables,
+                                 const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                 void* workspace, size_t wkspSize,
+                                 int writeEntropy)
 {
     size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ);
     size_t cSeqSizeEstimate = 0;
     cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff,
-                                         fseTables->offcodeCTable, NULL,
-                                         OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
-                                         workspace, wkspSize);
+                                    fseTables->offcodeCTable, NULL,
+                                    OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+                                    workspace, wkspSize);
     cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL,
-                                         fseTables->litlengthCTable, LL_bits,
-                                         LL_defaultNorm, LL_defaultNormLog, MaxLL,
-                                         workspace, wkspSize);
+                                    fseTables->litlengthCTable, LL_bits,
+                                    LL_defaultNorm, LL_defaultNormLog, MaxLL,
+                                    workspace, wkspSize);
     cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML,
-                                         fseTables->matchlengthCTable, ML_bits,
-                                         ML_defaultNorm, ML_defaultNormLog, MaxML,
-                                         workspace, wkspSize);
+                                    fseTables->matchlengthCTable, ML_bits,
+                                    ML_defaultNorm, ML_defaultNormLog, MaxML,
+                                    workspace, wkspSize);
     if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize;
     return cSeqSizeEstimate + sequencesSectionHeaderSize;
 }
 
 /* Returns the size estimate for a given stream of literals, of, ll, ml */
-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize,
-                                     const BYTE* ofCodeTable,
-                                     const BYTE* llCodeTable,
-                                     const BYTE* mlCodeTable,
-                                     size_t nbSeq,
-                                     const ZSTD_entropyCTables_t* entropy,
-                                     const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                                     void* workspace, size_t wkspSize,
-                                     int writeLitEntropy, int writeSeqEntropy) {
+static size_t
+ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize,
+                       const BYTE* ofCodeTable,
+                       const BYTE* llCodeTable,
+                       const BYTE* mlCodeTable,
+                       size_t nbSeq,
+                       const ZSTD_entropyCTables_t* entropy,
+                       const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                       void* workspace, size_t wkspSize,
+                       int writeLitEntropy, int writeSeqEntropy)
+{
     size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize,
-                                                         &entropy->huf, &entropyMetadata->hufMetadata,
-                                                         workspace, wkspSize, writeLitEntropy);
+                                    &entropy->huf, &entropyMetadata->hufMetadata,
+                                    workspace, wkspSize, writeLitEntropy);
     size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
-                                                         nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
-                                                         workspace, wkspSize, writeSeqEntropy);
+                                    nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
+                                    workspace, wkspSize, writeSeqEntropy);
     return seqSize + literalsSize + ZSTD_blockHeaderSize;
 }
 
 /* Builds entropy statistics and uses them for blocksize estimation.
  *
- * Returns the estimated compressed size of the seqStore, or a zstd error.
+ * @return: estimated compressed size of the seqStore, or a zstd error.
  */
-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) {
-    ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata;
+static size_t
+ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc)
+{
+    ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata;
     DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()");
     FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore,
                     &zc->blockState.prevCBlock->entropy,
                     &zc->blockState.nextCBlock->entropy,
                     &zc->appliedParams,
                     entropyMetadata,
-                    zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), "");
-    return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart),
+                    zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), "");
+    return ZSTD_estimateBlockSize(
+                    seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart),
                     seqStore->ofCode, seqStore->llCode, seqStore->mlCode,
                     (size_t)(seqStore->sequences - seqStore->sequencesStart),
-                    &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE,
+                    &zc->blockState.nextCBlock->entropy,
+                    entropyMetadata,
+                    zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE,
                     (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1);
 }
 
 /* Returns literals bytes represented in a seqStore */
-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) {
+static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore)
+{
     size_t literalsBytes = 0;
-    size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart;
+    size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
     size_t i;
     for (i = 0; i < nbSeqs; ++i) {
-        seqDef seq = seqStore->sequencesStart[i];
+        seqDef const seq = seqStore->sequencesStart[i];
         literalsBytes += seq.litLength;
         if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) {
             literalsBytes += 0x10000;
-        }
-    }
+    }   }
     return literalsBytes;
 }
 
 /* Returns match bytes represented in a seqStore */
-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
+static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore)
+{
     size_t matchBytes = 0;
-    size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart;
+    size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
     size_t i;
     for (i = 0; i < nbSeqs; ++i) {
         seqDef seq = seqStore->sequencesStart[i];
         matchBytes += seq.mlBase + MINMATCH;
         if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) {
             matchBytes += 0x10000;
-        }
-    }
+    }   }
     return matchBytes;
 }
 
@@ -3307,15 +3846,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) {
  */
 static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
                                const seqStore_t* originalSeqStore,
-                                     size_t startIdx, size_t endIdx) {
-    BYTE* const litEnd = originalSeqStore->lit;
-    size_t literalsBytes;
-    size_t literalsBytesPreceding = 0;
-
+                                     size_t startIdx, size_t endIdx)
+{
     *resultSeqStore = *originalSeqStore;
     if (startIdx > 0) {
         resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx;
-        literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
+        resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
     }
 
     /* Move longLengthPos into the correct position if necessary */
@@ -3328,13 +3864,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
     }
     resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx;
     resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx;
-    literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
-    resultSeqStore->litStart += literalsBytesPreceding;
     if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) {
         /* This accounts for possible last literals if the derived chunk reaches the end of the block */
-        resultSeqStore->lit = litEnd;
+        assert(resultSeqStore->lit == originalSeqStore->lit);
     } else {
-        resultSeqStore->lit = resultSeqStore->litStart+literalsBytes;
+        size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore);
+        resultSeqStore->lit = resultSeqStore->litStart + literalsBytes;
     }
     resultSeqStore->llCode += startIdx;
     resultSeqStore->mlCode += startIdx;
@@ -3342,20 +3877,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore,
 }
 
 /*
- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history.
- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq().
+ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history.
+ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq().
  */
 static U32
-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0)
-{
-    U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0;  /* [ 0 - 3 ] */
-    assert(STORED_IS_REPCODE(offCode));
-    if (adjustedOffCode == ZSTD_REP_NUM) {
-        /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */
-        assert(rep[0] > 0);
+ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0)
+{
+    U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0;  /* [ 0 - 3 ] */
+    assert(OFFBASE_IS_REPCODE(offBase));
+    if (adjustedRepCode == ZSTD_REP_NUM) {
+        assert(ll0);
+        /* litlength == 0 and offCode == 2 implies selection of first repcode - 1
+         * This is only valid if it results in a valid offset value, aka > 0.
+         * Note : it may happen that `rep[0]==1` in exceptional circumstances.
+         * In which case this function will return 0, which is an invalid offset.
+         * It's not an issue though, since this value will be
+         * compared and discarded within ZSTD_seqStore_resolveOffCodes().
+         */
         return rep[0] - 1;
     }
-    return rep[adjustedOffCode];
+    return rep[adjustedRepCode];
 }
 
 /*
@@ -3371,30 +3912,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c
  *        1-3 : repcode 1-3
  *        4+ : real_offset+3
  */
-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
-                                          seqStore_t* const seqStore, U32 const nbSeq) {
+static void
+ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes,
+                        const seqStore_t* const seqStore, U32 const nbSeq)
+{
     U32 idx = 0;
+    U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq;
     for (; idx < nbSeq; ++idx) {
         seqDef* const seq = seqStore->sequencesStart + idx;
-        U32 const ll0 = (seq->litLength == 0);
-        U32 const offCode = OFFBASE_TO_STORED(seq->offBase);
-        assert(seq->offBase > 0);
-        if (STORED_IS_REPCODE(offCode)) {
-            U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0);
-            U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0);
+        U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx);
+        U32 const offBase = seq->offBase;
+        assert(offBase > 0);
+        if (OFFBASE_IS_REPCODE(offBase)) {
+            U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0);
+            U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0);
             /* Adjust simulated decompression repcode history if we come across a mismatch. Replace
              * the repcode with the offset it actually references, determined by the compression
              * repcode history.
              */
             if (dRawOffset != cRawOffset) {
-                seq->offBase = cRawOffset + ZSTD_REP_NUM;
+                seq->offBase = OFFSET_TO_OFFBASE(cRawOffset);
             }
         }
         /* Compression repcode history is always updated with values directly from the unmodified seqStore.
          * Decompression repcode history may use modified seq->offset value taken from compression repcode history.
          */
-        ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0);
-        ZSTD_updateRep(cRepcodes->rep, offCode, ll0);
+        ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0);
+        ZSTD_updateRep(cRepcodes->rep, offBase, ll0);
     }
 }
 
@@ -3404,10 +3948,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_
  * Returns the total size of that block (including header) or a ZSTD error code.
  */
 static size_t
-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore,
+ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc,
+                            const seqStore_t* const seqStore,
                                   repcodes_t* const dRep, repcodes_t* const cRep,
                                   void* dst, size_t dstCapacity,
-                                  const void* src, size_t srcSize,
+                            const void* src, size_t srcSize,
                                   U32 lastBlock, U32 isPartition)
 {
     const U32 rleMaxLength = 25;
@@ -3442,8 +3987,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore,
         cSeqsSize = 1;
     }
 
+    /* Sequence collection not supported when block splitting */
     if (zc->seqCollector.collectSequences) {
-        ZSTD_copyBlockSequences(zc);
+        FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed");
         ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
         return 0;
     }
@@ -3481,45 +4027,49 @@ typedef struct {
 
 /* Helper function to perform the recursive search for block splits.
  * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half.
- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then
- * we do not recurse.
+ * If advantageous to split, then we recurse down the two sub-blocks.
+ * If not, or if an error occurred in estimation, then we do not recurse.
  *
- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING.
+ * Note: The recursion depth is capped by a heuristic minimum number of sequences,
+ * defined by MIN_SEQUENCES_BLOCK_SPLITTING.
  * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING).
  * In practice, recursion depth usually doesn't go beyond 4.
  *
- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
+ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS.
+ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize
  * maximum of 128 KB, this value is actually impossible to reach.
  */
 static void
 ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx,
                              ZSTD_CCtx* zc, const seqStore_t* origSeqStore)
 {
-    seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk;
-    seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore;
-    seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore;
+    seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk;
+    seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore;
+    seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore;
     size_t estimatedOriginalSize;
     size_t estimatedFirstHalfSize;
     size_t estimatedSecondHalfSize;
     size_t midIdx = (startIdx + endIdx)/2;
 
+    DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
+    assert(endIdx >= startIdx);
     if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) {
-        DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences");
+        DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx);
         return;
     }
-    DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx);
     ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx);
     ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx);
     ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx);
     estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc);
     estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc);
     estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc);
-    DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
+    DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu",
              estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize);
     if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) {
         return;
     }
     if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) {
+        DEBUGLOG(5, "split decided at seqNb:%zu", midIdx);
         ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore);
         splits->splitLocations[splits->idx] = (U32)midIdx;
         splits->idx++;
@@ -3527,14 +4077,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end
     }
 }
 
-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio.
+/* Base recursive function.
+ * Populates a table with intra-block partition indices that can improve compression ratio.
  *
- * Returns the number of splits made (which equals the size of the partition table - 1).
+ * @return: number of splits made (which equals the size of the partition table - 1).
  */
-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) {
-    seqStoreSplits splits = {partitions, 0};
+static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
+{
+    seqStoreSplits splits;
+    splits.splitLocations = partitions;
+    splits.idx = 0;
     if (nbSeq <= 4) {
-        DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split");
+        DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq);
         /* Refuse to try and split anything with less than 4 sequences */
         return 0;
     }
@@ -3550,18 +4104,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq)
  * Returns combined size of all blocks (which includes headers), or a ZSTD error code.
  */
 static size_t
-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity,
-                                       const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq)
+ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc,
+                                    void* dst, size_t dstCapacity,
+                              const void* src, size_t blockSize,
+                                    U32 lastBlock, U32 nbSeq)
 {
     size_t cSize = 0;
     const BYTE* ip = (const BYTE*)src;
     BYTE* op = (BYTE*)dst;
     size_t i = 0;
     size_t srcBytesTotal = 0;
-    U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
-    seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
-    seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore;
-    size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
+    U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */
+    seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore;
+    seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore;
+    size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq);
 
     /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history
      * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two
@@ -3583,30 +4139,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
     ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t));
     ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t));
 
-    DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
+    DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
                 (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
                 (unsigned)zc->blockState.matchState.nextToUpdate);
 
     if (numSplits == 0) {
-        size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
-                                                                   &dRep, &cRep,
-                                                                    op, dstCapacity,
-                                                                    ip, blockSize,
-                                                                    lastBlock, 0 /* isPartition */);
+        size_t cSizeSingleBlock =
+            ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore,
+                                            &dRep, &cRep,
+                                            op, dstCapacity,
+                                            ip, blockSize,
+                                            lastBlock, 0 /* isPartition */);
         FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!");
         DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits");
-        assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
+        assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX);
+        assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize);
         return cSizeSingleBlock;
     }
 
     ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]);
     for (i = 0; i <= numSplits; ++i) {
-        size_t srcBytes;
         size_t cSizeChunk;
         U32 const lastPartition = (i == numSplits);
         U32 lastBlockEntireSrc = 0;
 
-        srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore);
+        size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore);
         srcBytesTotal += srcBytes;
         if (lastPartition) {
             /* This is the final partition, need to account for possible last literals */
@@ -3621,7 +4178,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
                                                        op, dstCapacity,
                                                        ip, srcBytes,
                                                        lastBlockEntireSrc, 1 /* isPartition */);
-        DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk);
+        DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size",
+                    ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk);
         FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!");
 
         ip += srcBytes;
@@ -3629,10 +4187,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac
         dstCapacity -= cSizeChunk;
         cSize += cSizeChunk;
         *currSeqStore = *nextSeqStore;
-        assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize);
+        assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize);
     }
-    /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes
-     * for the next block.
+    /* cRep and dRep may have diverged during the compression.
+     * If so, we use the dRep repcodes for the next block.
      */
     ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t));
     return cSize;
@@ -3643,8 +4201,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
                               void* dst, size_t dstCapacity,
                               const void* src, size_t srcSize, U32 lastBlock)
 {
-    const BYTE* ip = (const BYTE*)src;
-    BYTE* op = (BYTE*)dst;
     U32 nbSeq;
     size_t cSize;
     DEBUGLOG(4, "ZSTD_compressBlock_splitBlock");
@@ -3655,7 +4211,8 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc,
         if (bss == ZSTDbss_noCompress) {
             if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
                 zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
-            cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock);
+            RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
+            cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
             FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
             DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block");
             return cSize;
@@ -3673,9 +4230,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
                             void* dst, size_t dstCapacity,
                             const void* src, size_t srcSize, U32 frame)
 {
-    /* This the upper bound for the length of an rle block.
-     * This isn't the actual upper bound. Finding the real threshold
-     * needs further investigation.
+    /* This is an estimated upper bound for the length of an rle block.
+     * This isn't the actual upper bound.
+     * Finding the real threshold needs further investigation.
      */
     const U32 rleMaxLength = 25;
     size_t cSize;
@@ -3687,11 +4244,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
 
     {   const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
         FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
-        if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; }
+        if (bss == ZSTDbss_noCompress) {
+            RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block");
+            cSize = 0;
+            goto out;
+        }
     }
 
     if (zc->seqCollector.collectSequences) {
-        ZSTD_copyBlockSequences(zc);
+        FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed");
         ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
         return 0;
     }
@@ -3767,10 +4328,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
          *   * cSize >= blockBound(srcSize): We have expanded the block too much so
          *     emit an uncompressed block.
          */
-        {
-            size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
+        {   size_t const cSize =
+                ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
             if (cSize != ERROR(dstSize_tooSmall)) {
-                size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
+                size_t const maxCSize =
+                    srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
                 FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed");
                 if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) {
                     ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState);
@@ -3778,7 +4340,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
                 }
             }
         }
-    }
+    } /* if (bss == ZSTDbss_compress)*/
 
     DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()");
     /* Superblock compression failed, attempt to emit a single no compress block.
@@ -3836,7 +4398,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
 *   All blocks will be terminated, all input will be consumed.
 *   Function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
 *   Frame is supposed already started (header already produced)
-*   @return : compressed size, or an error code
+*  @return : compressed size, or an error code
 */
 static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
                                      void* dst, size_t dstCapacity,
@@ -3860,7 +4422,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
         ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
         U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
 
-        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE,
+        /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+         * additional 1. We need to revisit and change this logic to be more consistent */
+        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1,
                         dstSize_tooSmall,
                         "not enough space to store compressed block");
         if (remaining < blockSize) blockSize = remaining;
@@ -3899,7 +4463,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx,
                     MEM_writeLE24(op, cBlockHeader);
                     cSize += ZSTD_blockHeaderSize;
                 }
-            }
+            }  /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/
 
 
             ip += blockSize;
@@ -4001,19 +4565,15 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity)
     }
 }
 
-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq)
+void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq)
 {
-    RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong,
-                    "wrong cctx stage");
-    RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable,
-                    parameter_unsupported,
-                    "incompatible with ldm");
+    assert(cctx->stage == ZSTDcs_init);
+    assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable);
     cctx->externSeqStore.seq = seq;
     cctx->externSeqStore.size = nbSeq;
     cctx->externSeqStore.capacity = nbSeq;
     cctx->externSeqStore.pos = 0;
     cctx->externSeqStore.posInSequence = 0;
-    return 0;
 }
 
 
@@ -4078,31 +4638,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
     }
 }
 
-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx,
-                              void* dst, size_t dstCapacity,
-                        const void* src, size_t srcSize)
+size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx,
+                                        void* dst, size_t dstCapacity,
+                                  const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize);
     return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */);
 }
 
+/* NOTE: Must just wrap ZSTD_compressContinue_public() */
+size_t ZSTD_compressContinue(ZSTD_CCtx* cctx,
+                             void* dst, size_t dstCapacity,
+                       const void* src, size_t srcSize)
+{
+    return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize);
+}
 
-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx)
 {
     ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams;
     assert(!ZSTD_checkCParams(cParams));
-    return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog);
+    return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog);
 }
 
-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */
+size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+{
+    return ZSTD_getBlockSize_deprecated(cctx);
+}
+
+/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */
+size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize);
-    { size_t const blockSizeMax = ZSTD_getBlockSize(cctx);
+    { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx);
       RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); }
 
     return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */);
 }
 
+/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */
+size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize);
+}
+
 /*! ZSTD_loadDictionaryContent() :
  *  @return : 0, or an error code
  */
@@ -4111,25 +4691,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
                                          ZSTD_cwksp* ws,
                                          ZSTD_CCtx_params const* params,
                                          const void* src, size_t srcSize,
-                                         ZSTD_dictTableLoadMethod_e dtlm)
+                                         ZSTD_dictTableLoadMethod_e dtlm,
+                                         ZSTD_tableFillPurpose_e tfp)
 {
     const BYTE* ip = (const BYTE*) src;
     const BYTE* const iend = ip + srcSize;
     int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL;
 
-    /* Assert that we the ms params match the params we're being given */
+    /* Assert that the ms params match the params we're being given */
     ZSTD_assertEqualCParams(params->cParams, ms->cParams);
 
-    if (srcSize > ZSTD_CHUNKSIZE_MAX) {
+    {   /* Ensure large dictionaries can't cause index overflow */
+
         /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX.
          * Dictionaries right at the edge will immediately trigger overflow
          * correction, but I don't want to insert extra constraints here.
          */
-        U32 const maxDictSize = ZSTD_CURRENT_MAX - 1;
-        /* We must have cleared our windows when our source is this large. */
-        assert(ZSTD_window_isEmpty(ms->window));
-        if (loadLdmDict)
-            assert(ZSTD_window_isEmpty(ls->window));
+        U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX;
+
+        int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(&params->cParams);
+        if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) {
+            /* Some dictionary matchfinders in zstd use "short cache",
+             * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each
+             * CDict hashtable entry as a tag rather than as part of an index.
+             * When short cache is used, we need to truncate the dictionary
+             * so that its indices don't overlap with the tag. */
+            U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX;
+            maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize);
+            assert(!loadLdmDict);
+        }
+
         /* If the dictionary is too large, only load the suffix of the dictionary. */
         if (srcSize > maxDictSize) {
             ip = iend - maxDictSize;
@@ -4138,35 +4729,58 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
         }
     }
 
-    DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
+    if (srcSize > ZSTD_CHUNKSIZE_MAX) {
+        /* We must have cleared our windows when our source is this large. */
+        assert(ZSTD_window_isEmpty(ms->window));
+        if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window));
+    }
     ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0);
-    ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
-    ms->forceNonContiguous = params->deterministicRefPrefix;
 
-    if (loadLdmDict) {
+    DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder);
+
+    if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */
         ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0);
         ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base);
+        ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams);
     }
 
+    /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */
+    if (params->cParams.strategy < ZSTD_btultra) {
+        U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28);
+        if (srcSize > maxDictSize) {
+            ip = iend - maxDictSize;
+            src = ip;
+            srcSize = maxDictSize;
+        }
+    }
+
+    ms->nextToUpdate = (U32)(ip - ms->window.base);
+    ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
+    ms->forceNonContiguous = params->deterministicRefPrefix;
+
     if (srcSize <= HASH_READ_SIZE) return 0;
 
     ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend);
 
-    if (loadLdmDict)
-        ZSTD_ldm_fillHashTable(ls, ip, iend, &params->ldmParams);
-
     switch(params->cParams.strategy)
     {
     case ZSTD_fast:
-        ZSTD_fillHashTable(ms, iend, dtlm);
+        ZSTD_fillHashTable(ms, iend, dtlm, tfp);
         break;
     case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(ms, iend, dtlm);
+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
+        ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp);
+#else
+        assert(0); /* shouldn't be called: cparams should've been adjusted. */
+#endif
         break;
 
     case ZSTD_greedy:
     case ZSTD_lazy:
     case ZSTD_lazy2:
+#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR)
         assert(srcSize >= HASH_READ_SIZE);
         if (ms->dedicatedDictSearch) {
             assert(ms->chainTable != NULL);
@@ -4174,7 +4788,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
         } else {
             assert(params->useRowMatchFinder != ZSTD_ps_auto);
             if (params->useRowMatchFinder == ZSTD_ps_enable) {
-                size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16);
+                size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog);
                 ZSTD_memset(ms->tagTable, 0, tagTableSize);
                 ZSTD_row_update(ms, iend-HASH_READ_SIZE);
                 DEBUGLOG(4, "Using row-based hash table for lazy dict");
@@ -4183,14 +4797,23 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
                 DEBUGLOG(4, "Using chain-based hash table for lazy dict");
             }
         }
+#else
+        assert(0); /* shouldn't be called: cparams should've been adjusted. */
+#endif
         break;
 
     case ZSTD_btlazy2:   /* we want the dictionary table fully sorted */
     case ZSTD_btopt:
     case ZSTD_btultra:
     case ZSTD_btultra2:
+#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
         assert(srcSize >= HASH_READ_SIZE);
         ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend);
+#else
+        assert(0); /* shouldn't be called: cparams should've been adjusted. */
+#endif
         break;
 
     default:
@@ -4237,11 +4860,10 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
 
         /* We only set the loaded table as valid if it contains all non-zero
          * weights. Otherwise, we set it to check */
-        if (!hasZeroWeights)
+        if (!hasZeroWeights && maxSymbolValue == 255)
             bs->entropy.huf.repeatMode = HUF_repeat_valid;
 
         RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, "");
-        RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, "");
         dictPtr += hufHeaderSize;
     }
 
@@ -4327,6 +4949,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
                                       ZSTD_CCtx_params const* params,
                                       const void* dict, size_t dictSize,
                                       ZSTD_dictTableLoadMethod_e dtlm,
+                                      ZSTD_tableFillPurpose_e tfp,
                                       void* workspace)
 {
     const BYTE* dictPtr = (const BYTE*)dict;
@@ -4345,7 +4968,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
     {
         size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
         FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
-            ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), "");
+            ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), "");
     }
     return dictID;
 }
@@ -4361,6 +4984,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
                          const void* dict, size_t dictSize,
                                ZSTD_dictContentType_e dictContentType,
                                ZSTD_dictTableLoadMethod_e dtlm,
+                               ZSTD_tableFillPurpose_e tfp,
                                void* workspace)
 {
     DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
@@ -4373,13 +4997,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
 
     /* dict restricted modes */
     if (dictContentType == ZSTD_dct_rawContent)
-        return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm);
+        return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp);
 
     if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
         if (dictContentType == ZSTD_dct_auto) {
             DEBUGLOG(4, "raw content dictionary detected");
             return ZSTD_loadDictionaryContent(
-                ms, ls, ws, params, dict, dictSize, dtlm);
+                ms, ls, ws, params, dict, dictSize, dtlm, tfp);
         }
         RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
         assert(0);   /* impossible */
@@ -4387,13 +5011,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
 
     /* dict as full zstd dictionary */
     return ZSTD_loadZstdDictionary(
-        bs, ms, ws, params, dict, dictSize, dtlm, workspace);
+        bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace);
 }
 
 #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB)
 #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL)
 
 /*! ZSTD_compressBegin_internal() :
+ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both
  * @return : 0, or an error code */
 static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
                                     const void* dict, size_t dictSize,
@@ -4426,11 +5051,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
                         cctx->blockState.prevCBlock, &cctx->blockState.matchState,
                         &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent,
                         cdict->dictContentSize, cdict->dictContentType, dtlm,
-                        cctx->entropyWorkspace)
+                        ZSTD_tfp_forCCtx, cctx->entropyWorkspace)
               : ZSTD_compress_insertDictionary(
                         cctx->blockState.prevCBlock, &cctx->blockState.matchState,
                         &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize,
-                        dictContentType, dtlm, cctx->entropyWorkspace);
+                        dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace);
         FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
         assert(dictID <= UINT_MAX);
         cctx->dictID = (U32)dictID;
@@ -4471,11 +5096,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
                                             &cctxParams, pledgedSrcSize);
 }
 
-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+static size_t
+ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
 {
     ZSTD_CCtx_params cctxParams;
-    {
-        ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict);
+    {   ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict);
         ZSTD_CCtxParams_init_internal(&cctxParams, &params, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel);
     }
     DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize);
@@ -4483,9 +5108,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di
                                        &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered);
 }
 
+size_t
+ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+{
+    return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel);
+}
+
 size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel)
 {
-    return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel);
+    return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel);
 }
 
 
@@ -4496,14 +5127,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity)
 {
     BYTE* const ostart = (BYTE*)dst;
     BYTE* op = ostart;
-    size_t fhSize = 0;
 
     DEBUGLOG(4, "ZSTD_writeEpilogue");
     RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing");
 
     /* special case : empty frame */
     if (cctx->stage == ZSTDcs_init) {
-        fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0);
+        size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0);
         FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed");
         dstCapacity -= fhSize;
         op += fhSize;
@@ -4513,8 +5143,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity)
     if (cctx->stage != ZSTDcs_ending) {
         /* write one last empty block, make it the "last" block */
         U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0;
-        RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue");
-        MEM_writeLE32(op, cBlockHeader24);
+        ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3);
+        RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue");
+        MEM_writeLE24(op, cBlockHeader24);
         op += ZSTD_blockHeaderSize;
         dstCapacity -= ZSTD_blockHeaderSize;
     }
@@ -4537,9 +5168,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize)
     (void)extraCSize;
 }
 
-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
-                         void* dst, size_t dstCapacity,
-                   const void* src, size_t srcSize)
+size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize)
 {
     size_t endResult;
     size_t const cSize = ZSTD_compressContinue_internal(cctx,
@@ -4563,6 +5194,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
     return cSize + endResult;
 }
 
+/* NOTE: Must just wrap ZSTD_compressEnd_public() */
+size_t ZSTD_compressEnd(ZSTD_CCtx* cctx,
+                        void* dst, size_t dstCapacity,
+                  const void* src, size_t srcSize)
+{
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
+}
+
 size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
                                void* dst, size_t dstCapacity,
                          const void* src, size_t srcSize,
@@ -4591,7 +5230,7 @@ size_t ZSTD_compress_advanced_internal(
     FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx,
                          dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
                          params, srcSize, ZSTDb_not_buffered) , "");
-    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
 }
 
 size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,
@@ -4709,7 +5348,7 @@ static size_t ZSTD_initCDict_internal(
         {   size_t const dictID = ZSTD_compress_insertDictionary(
                     &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace,
                     &params, cdict->dictContent, cdict->dictContentSize,
-                    dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace);
+                    dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace);
             FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
             assert(dictID <= (size_t)(U32)-1);
             cdict->dictID = (U32)dictID;
@@ -4811,7 +5450,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2(
                         cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch,
                         customMem);
 
-    if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
+    if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict,
                                     dict, dictSize,
                                     dictLoadMethod, dictContentType,
                                     cctxParams) )) {
@@ -4906,6 +5545,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict(
     params.cParams = cParams;
     params.useRowMatchFinder = useRowMatchFinder;
     cdict->useRowMatchFinder = useRowMatchFinder;
+    cdict->compressionLevel = ZSTD_NO_CLEVEL;
 
     if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
                                               dict, dictSize,
@@ -4985,12 +5625,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced(
 
 /* ZSTD_compressBegin_usingCDict() :
  * cdict must be != NULL */
-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
 {
     ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
     return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN);
 }
 
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+    return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict);
+}
+
 /*! ZSTD_compress_usingCDict_internal():
  * Implementation of various ZSTD_compress_usingCDict* functions.
  */
@@ -5000,7 +5645,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx,
                                 const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)
 {
     FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */
-    return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+    return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize);
 }
 
 /*! ZSTD_compress_usingCDict_advanced():
@@ -5197,30 +5842,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
 
 static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx)
 {
-    size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos;
-    if (hintInSize==0) hintInSize = cctx->blockSize;
-    return hintInSize;
+    if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
+        return cctx->blockSize - cctx->stableIn_notConsumed;
+    }
+    assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered);
+    {   size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos;
+        if (hintInSize==0) hintInSize = cctx->blockSize;
+        return hintInSize;
+    }
 }
 
 /* ZSTD_compressStream_generic():
  *  internal function for all *compressStream*() variants
- *  non-static, because can be called from zstdmt_compress.c
- * @return : hint size for next input */
+ * @return : hint size for next input to complete ongoing block */
 static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                                           ZSTD_outBuffer* output,
                                           ZSTD_inBuffer* input,
                                           ZSTD_EndDirective const flushMode)
 {
-    const char* const istart = (const char*)input->src;
-    const char* const iend = input->size != 0 ? istart + input->size : istart;
-    const char* ip = input->pos != 0 ? istart + input->pos : istart;
-    char* const ostart = (char*)output->dst;
-    char* const oend = output->size != 0 ? ostart + output->size : ostart;
-    char* op = output->pos != 0 ? ostart + output->pos : ostart;
+    const char* const istart = (assert(input != NULL), (const char*)input->src);
+    const char* const iend = (istart != NULL) ? istart + input->size : istart;
+    const char* ip = (istart != NULL) ? istart + input->pos : istart;
+    char* const ostart = (assert(output != NULL), (char*)output->dst);
+    char* const oend = (ostart != NULL) ? ostart + output->size : ostart;
+    char* op = (ostart != NULL) ? ostart + output->pos : ostart;
     U32 someMoreWork = 1;
 
     /* check expectations */
-    DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode);
+    DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos);
+    assert(zcs != NULL);
+    if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) {
+        assert(input->pos >= zcs->stableIn_notConsumed);
+        input->pos -= zcs->stableIn_notConsumed;
+        if (ip) ip -= zcs->stableIn_notConsumed;
+        zcs->stableIn_notConsumed = 0;
+    }
     if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) {
         assert(zcs->inBuff != NULL);
         assert(zcs->inBuffSize > 0);
@@ -5229,8 +5885,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
         assert(zcs->outBuff !=  NULL);
         assert(zcs->outBuffSize > 0);
     }
-    assert(output->pos <= output->size);
+    if (input->src == NULL) assert(input->size == 0);
     assert(input->pos <= input->size);
+    if (output->dst == NULL) assert(output->size == 0);
+    assert(output->pos <= output->size);
     assert((U32)flushMode <= (U32)ZSTD_e_end);
 
     while (someMoreWork) {
@@ -5245,7 +5903,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                 || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)  /* OR we are allowed to return dstSizeTooSmall */
               && (zcs->inBuffPos == 0) ) {
                 /* shortcut to compression pass directly into output buffer */
-                size_t const cSize = ZSTD_compressEnd(zcs,
+                size_t const cSize = ZSTD_compressEnd_public(zcs,
                                                 op, oend-op, ip, iend-ip);
                 DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize);
                 FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed");
@@ -5262,8 +5920,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                                         zcs->inBuff + zcs->inBuffPos, toLoad,
                                         ip, iend-ip);
                 zcs->inBuffPos += loaded;
-                if (loaded != 0)
-                    ip += loaded;
+                if (ip) ip += loaded;
                 if ( (flushMode == ZSTD_e_continue)
                   && (zcs->inBuffPos < zcs->inBuffTarget) ) {
                     /* not enough input to fill full block : stop here */
@@ -5274,6 +5931,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                     /* empty */
                     someMoreWork = 0; break;
                 }
+            } else {
+                assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable);
+                if ( (flushMode == ZSTD_e_continue)
+                  && ( (size_t)(iend - ip) < zcs->blockSize) ) {
+                    /* can't compress a full block : stop here */
+                    zcs->stableIn_notConsumed = (size_t)(iend - ip);
+                    ip = iend;  /* pretend to have consumed input */
+                    someMoreWork = 0; break;
+                }
+                if ( (flushMode == ZSTD_e_flush)
+                  && (ip == iend) ) {
+                    /* empty */
+                    someMoreWork = 0; break;
+                }
             }
             /* compress current block (note : this stage cannot be stopped in the middle) */
             DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode);
@@ -5281,9 +5952,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                 void* cDst;
                 size_t cSize;
                 size_t oSize = oend-op;
-                size_t const iSize = inputBuffered
-                    ? zcs->inBuffPos - zcs->inToCompress
-                    : MIN((size_t)(iend - ip), zcs->blockSize);
+                size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress
+                                                   : MIN((size_t)(iend - ip), zcs->blockSize);
                 if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable)
                     cDst = op;   /* compress into output buffer, to skip flush stage */
                 else
@@ -5291,9 +5961,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                 if (inputBuffered) {
                     unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend);
                     cSize = lastBlock ?
-                            ZSTD_compressEnd(zcs, cDst, oSize,
+                            ZSTD_compressEnd_public(zcs, cDst, oSize,
                                         zcs->inBuff + zcs->inToCompress, iSize) :
-                            ZSTD_compressContinue(zcs, cDst, oSize,
+                            ZSTD_compressContinue_public(zcs, cDst, oSize,
                                         zcs->inBuff + zcs->inToCompress, iSize);
                     FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
                     zcs->frameEnded = lastBlock;
@@ -5306,19 +5976,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
                     if (!lastBlock)
                         assert(zcs->inBuffTarget <= zcs->inBuffSize);
                     zcs->inToCompress = zcs->inBuffPos;
-                } else {
-                    unsigned const lastBlock = (ip + iSize == iend);
-                    assert(flushMode == ZSTD_e_end /* Already validated */);
+                } else { /* !inputBuffered, hence ZSTD_bm_stable */
+                    unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend);
                     cSize = lastBlock ?
-                            ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) :
-                            ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize);
+                            ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) :
+                            ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize);
                     /* Consume the input prior to error checking to mirror buffered mode. */
-                    if (iSize > 0)
-                        ip += iSize;
+                    if (ip) ip += iSize;
                     FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
                     zcs->frameEnded = lastBlock;
-                    if (lastBlock)
-                        assert(ip == iend);
+                    if (lastBlock) assert(ip == iend);
                 }
                 if (cDst == op) {  /* no need to flush */
                     op += cSize;
@@ -5388,8 +6055,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf
 /* After a compression call set the expected input/output buffer.
  * This is validated at the start of the next compression call.
  */
-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input)
+static void
+ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input)
 {
+    DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)");
     if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
         cctx->expectedInBuffer = *input;
     }
@@ -5408,22 +6077,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx,
 {
     if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) {
         ZSTD_inBuffer const expect = cctx->expectedInBuffer;
-        if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size)
-            RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!");
-        if (endOp != ZSTD_e_end)
-            RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!");
+        if (expect.src != input->src || expect.pos != input->pos)
+            RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!");
     }
+    (void)endOp;
     if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) {
         size_t const outBufferSize = output->size - output->pos;
         if (cctx->expectedOutBufferSize != outBufferSize)
-            RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!");
+            RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!");
     }
     return 0;
 }
 
 static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
                                              ZSTD_EndDirective endOp,
-                                             size_t inSize) {
+                                             size_t inSize)
+{
     ZSTD_CCtx_params params = cctx->requestedParams;
     ZSTD_prefixDict const prefixDict = cctx->prefixDict;
     FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */
@@ -5437,9 +6106,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
         params.compressionLevel = cctx->cdict->compressionLevel;
     }
     DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage");
-    if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1;  /* auto-fix pledgedSrcSize */
-    {
-        size_t const dictSize = prefixDict.dict
+    if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1;  /* auto-determine pledgedSrcSize */
+
+    {   size_t const dictSize = prefixDict.dict
                 ? prefixDict.dictSize
                 : (cctx->cdict ? cctx->cdict->dictContentSize : 0);
         ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, &params, cctx->pledgedSrcSizePlusOne - 1);
@@ -5451,6 +6120,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
     params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, &params.cParams);
     params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, &params.cParams);
     params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, &params.cParams);
+    params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences);
+    params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize);
+    params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel);
 
     {   U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1;
         assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
@@ -5477,6 +6149,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx,
     return 0;
 }
 
+/* @return provides a minimum amount of data remaining to be flushed from internal buffers
+ */
 size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
                              ZSTD_outBuffer* output,
                              ZSTD_inBuffer* input,
@@ -5491,8 +6165,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
 
     /* transparent initialization stage */
     if (cctx->streamStage == zcss_init) {
-        FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed");
-        ZSTD_setBufferExpectations(cctx, output, input);    /* Set initial buffer expectations now that we've initialized */
+        size_t const inputSize = input->size - input->pos;  /* no obligation to start from pos==0 */
+        size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed;
+        if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */
+          && (endOp == ZSTD_e_continue)                             /* no flush requested, more input to come */
+          && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) {              /* not even reached one block yet */
+            if (cctx->stableIn_notConsumed) {  /* not the first time */
+                /* check stable source guarantees */
+                RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer");
+                RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos");
+            }
+            /* pretend input was consumed, to give a sense forward progress */
+            input->pos = input->size;
+            /* save stable inBuffer, for later control, and flush/end */
+            cctx->expectedInBuffer = *input;
+            /* but actually input wasn't consumed, so keep track of position from where compression shall resume */
+            cctx->stableIn_notConsumed += inputSize;
+            /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */
+            return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format);  /* at least some header to produce */
+        }
+        FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed");
+        ZSTD_setBufferExpectations(cctx, output, input);   /* Set initial buffer expectations now that we've initialized */
     }
     /* end of transparent initialization stage */
 
@@ -5510,13 +6203,20 @@ size_t ZSTD_compressStream2_simpleArgs (
                       const void* src, size_t srcSize, size_t* srcPos,
                             ZSTD_EndDirective endOp)
 {
-    ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
-    ZSTD_inBuffer  input  = { src, srcSize, *srcPos };
+    ZSTD_outBuffer output;
+    ZSTD_inBuffer  input;
+    output.dst = dst;
+    output.size = dstCapacity;
+    output.pos = *dstPos;
+    input.src = src;
+    input.size = srcSize;
+    input.pos = *srcPos;
     /* ZSTD_compressStream2() will check validity of dstPos and srcPos */
-    size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
-    *dstPos = output.pos;
-    *srcPos = input.pos;
-    return cErr;
+    {   size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
+        *dstPos = output.pos;
+        *srcPos = input.pos;
+        return cErr;
+    }
 }
 
 size_t ZSTD_compress2(ZSTD_CCtx* cctx,
@@ -5539,6 +6239,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
         /* Reset to the original values. */
         cctx->requestedParams.inBufferMode = originalInBufferMode;
         cctx->requestedParams.outBufferMode = originalOutBufferMode;
+
         FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed");
         if (result != 0) {  /* compression not completed, due to lack of output space */
             assert(oPos == dstCapacity);
@@ -5549,64 +6250,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx,
     }
 }
 
-typedef struct {
-    U32 idx;             /* Index in array of ZSTD_Sequence */
-    U32 posInSequence;   /* Position within sequence at idx */
-    size_t posInSrc;        /* Number of bytes given by sequences provided so far */
-} ZSTD_sequencePosition;
-
 /* ZSTD_validateSequence() :
  * @offCode : is presumed to follow format required by ZSTD_storeSeq()
  * @returns a ZSTD error code if sequence is not valid
  */
 static size_t
-ZSTD_validateSequence(U32 offCode, U32 matchLength,
-                      size_t posInSrc, U32 windowLog, size_t dictSize)
+ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch,
+                      size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer)
 {
-    U32 const windowSize = 1 << windowLog;
+    U32 const windowSize = 1u << windowLog;
     /* posInSrc represents the amount of data the decoder would decode up to this point.
      * As long as the amount of data decoded is less than or equal to window size, offsets may be
      * larger than the total length of output decoded in order to reference the dict, even larger than
      * window size. After output surpasses windowSize, we're limited to windowSize offsets again.
      */
     size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize;
-    RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!");
-    RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small");
+    size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 3 : 4;
+    RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!");
+    /* Validate maxNbSeq is large enough for the given matchLength and minMatch */
+    RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch");
     return 0;
 }
 
 /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */
-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
+static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0)
 {
-    U32 offCode = STORE_OFFSET(rawOffset);
+    U32 offBase = OFFSET_TO_OFFBASE(rawOffset);
 
     if (!ll0 && rawOffset == rep[0]) {
-        offCode = STORE_REPCODE_1;
+        offBase = REPCODE1_TO_OFFBASE;
     } else if (rawOffset == rep[1]) {
-        offCode = STORE_REPCODE(2 - ll0);
+        offBase = REPCODE_TO_OFFBASE(2 - ll0);
     } else if (rawOffset == rep[2]) {
-        offCode = STORE_REPCODE(3 - ll0);
+        offBase = REPCODE_TO_OFFBASE(3 - ll0);
     } else if (ll0 && rawOffset == rep[0] - 1) {
-        offCode = STORE_REPCODE_3;
+        offBase = REPCODE3_TO_OFFBASE;
     }
-    return offCode;
+    return offBase;
 }
 
-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
- */
-static size_t
+size_t
 ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
                                               ZSTD_sequencePosition* seqPos,
                                         const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                        const void* src, size_t blockSize)
+                                        const void* src, size_t blockSize,
+                                        ZSTD_paramSwitch_e externalRepSearch)
 {
     U32 idx = seqPos->idx;
+    U32 const startIdx = idx;
     BYTE const* ip = (BYTE const*)(src);
     const BYTE* const iend = ip + blockSize;
     repcodes_t updatedRepcodes;
     U32 dictSize;
 
+    DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize);
+
     if (cctx->cdict) {
         dictSize = (U32)cctx->cdict->dictContentSize;
     } else if (cctx->prefixDict.dict) {
@@ -5615,25 +6313,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
         dictSize = 0;
     }
     ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
-    for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) {
+    for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) {
         U32 const litLength = inSeqs[idx].litLength;
-        U32 const ll0 = (litLength == 0);
         U32 const matchLength = inSeqs[idx].matchLength;
-        U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
-        ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
+        U32 offBase;
+
+        if (externalRepSearch == ZSTD_ps_disable) {
+            offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset);
+        } else {
+            U32 const ll0 = (litLength == 0);
+            offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0);
+            ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
+        }
 
-        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
+        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
         if (cctx->appliedParams.validateSequences) {
             seqPos->posInSrc += litLength + matchLength;
-            FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
-                                                cctx->appliedParams.cParams.windowLog, dictSize),
+            FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
+                                                cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)),
                                                 "Sequence validation failed");
         }
-        RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
+        RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
                         "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
-        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
+        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
         ip += matchLength + litLength;
     }
+
+    /* If we skipped repcode search while parsing, we need to update repcodes now */
+    assert(externalRepSearch != ZSTD_ps_auto);
+    assert(idx >= startIdx);
+    if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) {
+        U32* const rep = updatedRepcodes.rep;
+        U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */
+
+        if (lastSeqIdx >= startIdx + 2) {
+            rep[2] = inSeqs[lastSeqIdx - 2].offset;
+            rep[1] = inSeqs[lastSeqIdx - 1].offset;
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        } else if (lastSeqIdx == startIdx + 1) {
+            rep[2] = rep[0];
+            rep[1] = inSeqs[lastSeqIdx - 1].offset;
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        } else {
+            assert(lastSeqIdx == startIdx);
+            rep[2] = rep[1];
+            rep[1] = rep[0];
+            rep[0] = inSeqs[lastSeqIdx].offset;
+        }
+    }
+
     ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t));
 
     if (inSeqs[idx].litLength) {
@@ -5642,26 +6370,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
         ip += inSeqs[idx].litLength;
         seqPos->posInSrc += inSeqs[idx].litLength;
     }
-    RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!");
+    RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!");
     seqPos->idx = idx+1;
     return 0;
 }
 
-/* Returns the number of bytes to move the current read position back by. Only non-zero
- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something
- * went wrong.
- *
- * This function will attempt to scan through blockSize bytes represented by the sequences
- * in inSeqs, storing any (partial) sequences.
- *
- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
- * avoid splitting a match, or to avoid splitting a match such that it would produce a match
- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
- */
-static size_t
+size_t
 ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
                                    const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                   const void* src, size_t blockSize)
+                                   const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch)
 {
     U32 idx = seqPos->idx;
     U32 startPosInSequence = seqPos->posInSequence;
@@ -5673,6 +6390,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
     U32 bytesAdjustment = 0;
     U32 finalMatchSplit = 0;
 
+    /* TODO(embg) support fast parsing mode in noBlockDelim mode */
+    (void)externalRepSearch;
+
     if (cctx->cdict) {
         dictSize = cctx->cdict->dictContentSize;
     } else if (cctx->prefixDict.dict) {
@@ -5680,7 +6400,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
     } else {
         dictSize = 0;
     }
-    DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize);
+    DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize);
     DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
     ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t));
     while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) {
@@ -5688,7 +6408,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
         U32 litLength = currSeq.litLength;
         U32 matchLength = currSeq.matchLength;
         U32 const rawOffset = currSeq.offset;
-        U32 offCode;
+        U32 offBase;
 
         /* Modify the sequence depending on where endPosInSequence lies */
         if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) {
@@ -5702,7 +6422,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
             /* Move to the next sequence */
             endPosInSequence -= currSeq.litLength + currSeq.matchLength;
             startPosInSequence = 0;
-            idx++;
         } else {
             /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence
                does not reach the end of the match. So, we have to split the sequence */
@@ -5742,21 +6461,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
         }
         /* Check if this offset can be represented with a repcode */
         {   U32 const ll0 = (litLength == 0);
-            offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0);
-            ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0);
+            offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0);
+            ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0);
         }
 
         if (cctx->appliedParams.validateSequences) {
             seqPos->posInSrc += litLength + matchLength;
-            FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc,
-                                                   cctx->appliedParams.cParams.windowLog, dictSize),
+            FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc,
+                                                   cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)),
                                                    "Sequence validation failed");
         }
-        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength);
-        RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation,
+        DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength);
+        RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid,
                         "Not enough memory allocated. Try adjusting ZSTD_c_minMatch.");
-        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength);
+        ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength);
         ip += matchLength + litLength;
+        if (!finalMatchSplit)
+            idx++; /* Next Sequence */
     }
     DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength);
     assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength);
@@ -5779,7 +6500,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition*
 
 typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
                                        const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
-                                       const void* src, size_t blockSize);
+                                       const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
 static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
 {
     ZSTD_sequenceCopier sequenceCopier = NULL;
@@ -5793,6 +6514,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode)
     return sequenceCopier;
 }
 
+/* Discover the size of next block by searching for the delimiter.
+ * Note that a block delimiter **must** exist in this mode,
+ * otherwise it's an input error.
+ * The block size retrieved will be later compared to ensure it remains within bounds */
+static size_t
+blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos)
+{
+    int end = 0;
+    size_t blockSize = 0;
+    size_t spos = seqPos.idx;
+    DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize);
+    assert(spos <= inSeqsSize);
+    while (spos < inSeqsSize) {
+        end = (inSeqs[spos].offset == 0);
+        blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength;
+        if (end) {
+            if (inSeqs[spos].matchLength != 0)
+                RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0");
+            break;
+        }
+        spos++;
+    }
+    if (!end)
+        RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter");
+    return blockSize;
+}
+
+/* More a "target" block size */
+static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining)
+{
+    int const lastBlock = (remaining <= blockSize);
+    return lastBlock ? remaining : blockSize;
+}
+
+static size_t determine_blockSize(ZSTD_sequenceFormat_e mode,
+                           size_t blockSize, size_t remaining,
+                     const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos)
+{
+    DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining);
+    if (mode == ZSTD_sf_noBlockDelimiters)
+        return blockSize_noDelimiter(blockSize, remaining);
+    {   size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos);
+        FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters");
+        if (explicitBlockSize > blockSize)
+            RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block");
+        if (explicitBlockSize > remaining)
+            RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source");
+        return explicitBlockSize;
+    }
+}
+
 /* Compress, block-by-block, all of the sequences given.
  *
  * Returns the cumulative size of all compressed blocks (including their headers),
@@ -5805,9 +6577,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
                           const void* src, size_t srcSize)
 {
     size_t cSize = 0;
-    U32 lastBlock;
-    size_t blockSize;
-    size_t compressedSeqsSize;
     size_t remaining = srcSize;
     ZSTD_sequencePosition seqPos = {0, 0, 0};
 
@@ -5827,22 +6596,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
     }
 
     while (remaining) {
+        size_t compressedSeqsSize;
         size_t cBlockSize;
         size_t additionalByteAdjustment;
-        lastBlock = remaining <= cctx->blockSize;
-        blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize;
+        size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters,
+                                        cctx->blockSize, remaining,
+                                        inSeqs, inSeqsSize, seqPos);
+        U32 const lastBlock = (blockSize == remaining);
+        FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size");
+        assert(blockSize <= remaining);
         ZSTD_resetSeqStore(&cctx->seqStore);
-        DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize);
+        DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize);
 
-        additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize);
+        additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes);
         FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy");
         blockSize -= additionalByteAdjustment;
 
         /* If blocks are too small, emit as a nocompress block */
-        if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
+        /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding
+         * additional 1. We need to revisit and change this logic to be more consistent */
+        if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) {
             cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
             FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed");
-            DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize);
+            DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize);
             cSize += cBlockSize;
             ip += blockSize;
             op += cBlockSize;
@@ -5851,6 +6627,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
             continue;
         }
 
+        RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block");
         compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore,
                                 &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy,
                                 &cctx->appliedParams,
@@ -5859,11 +6636,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
                                 cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
                                 cctx->bmi2);
         FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed");
-        DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize);
+        DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize);
 
         if (!cctx->isFirstBlock &&
             ZSTD_maybeRLE(&cctx->seqStore) &&
-            ZSTD_isRLE((BYTE const*)src, srcSize)) {
+            ZSTD_isRLE(ip, blockSize)) {
             /* We don't want to emit our first block as a RLE even if it qualifies because
             * doing so will cause the decoder (cli only) to throw a "should consume all input error."
             * This is only an issue for zstd <= v1.4.3
@@ -5874,12 +6651,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
         if (compressedSeqsSize == 0) {
             /* ZSTD_noCompressBlock writes the block header as well */
             cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
-            FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed");
-            DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize);
+            FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed");
+            DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize);
         } else if (compressedSeqsSize == 1) {
             cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock);
-            FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed");
-            DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize);
+            FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed");
+            DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize);
         } else {
             U32 cBlockHeader;
             /* Error checking and repcodes update */
@@ -5891,11 +6668,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
             cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3);
             MEM_writeLE24(op, cBlockHeader);
             cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize;
-            DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize);
+            DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize);
         }
 
         cSize += cBlockSize;
-        DEBUGLOG(4, "cSize running total: %zu", cSize);
 
         if (lastBlock) {
             break;
@@ -5906,12 +6682,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx,
             dstCapacity -= cBlockSize;
             cctx->isFirstBlock = 0;
         }
+        DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity);
     }
 
+    DEBUGLOG(4, "cSize final total: %zu", cSize);
     return cSize;
 }
 
-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity,
+size_t ZSTD_compressSequences(ZSTD_CCtx* cctx,
+                              void* dst, size_t dstCapacity,
                               const ZSTD_Sequence* inSeqs, size_t inSeqsSize,
                               const void* src, size_t srcSize)
 {
@@ -5921,7 +6700,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
     size_t frameHeaderSize = 0;
 
     /* Transparent initialization stage, same as compressStream2() */
-    DEBUGLOG(3, "ZSTD_compressSequences()");
+    DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity);
     assert(cctx != NULL);
     FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed");
     /* Begin writing output, starting with frame header */
@@ -5949,26 +6728,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci
         cSize += 4;
     }
 
-    DEBUGLOG(3, "Final compressed size: %zu", cSize);
+    DEBUGLOG(4, "Final compressed size: %zu", cSize);
     return cSize;
 }
 
 /*======   Finalize   ======*/
 
+static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs)
+{
+    const ZSTD_inBuffer nullInput = { NULL, 0, 0 };
+    const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable);
+    return stableInput ? zcs->expectedInBuffer : nullInput;
+}
+
 /*! ZSTD_flushStream() :
  * @return : amount of data remaining to flush */
 size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
 {
-    ZSTD_inBuffer input = { NULL, 0, 0 };
+    ZSTD_inBuffer input = inBuffer_forEndFlush(zcs);
+    input.size = input.pos; /* do not ingest more input during flush */
     return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush);
 }
 
 
 size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
 {
-    ZSTD_inBuffer input = { NULL, 0, 0 };
+    ZSTD_inBuffer input = inBuffer_forEndFlush(zcs);
     size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end);
-    FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed");
+    FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed");
     if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush;   /* minimal estimation */
     /* single thread mode : attempt to calculate remaining to flush more precisely */
     {   size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE;
@@ -6090,7 +6877,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel,
             cp.targetLength = (unsigned)(-clampedCompressionLevel);
         }
         /* refine parameters based on srcSize & dictSize */
-        return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode);
+        return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto);
     }
 }
 
@@ -6125,3 +6912,29 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH
     if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
     return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown);
 }
+
+void ZSTD_registerSequenceProducer(
+    ZSTD_CCtx* zc,
+    void* extSeqProdState,
+    ZSTD_sequenceProducer_F extSeqProdFunc
+) {
+    assert(zc != NULL);
+    ZSTD_CCtxParams_registerSequenceProducer(
+        &zc->requestedParams, extSeqProdState, extSeqProdFunc
+    );
+}
+
+void ZSTD_CCtxParams_registerSequenceProducer(
+  ZSTD_CCtx_params* params,
+  void* extSeqProdState,
+  ZSTD_sequenceProducer_F extSeqProdFunc
+) {
+    assert(params != NULL);
+    if (extSeqProdFunc != NULL) {
+        params->extSeqProdFunc = extSeqProdFunc;
+        params->extSeqProdState = extSeqProdState;
+    } else {
+        params->extSeqProdFunc = NULL;
+        params->extSeqProdState = NULL;
+    }
+}
diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h
index 71697a11ae30..53cb582a8d2b 100644
--- a/lib/zstd/compress/zstd_compress_internal.h
+++ b/lib/zstd/compress/zstd_compress_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -20,6 +21,7 @@
 ***************************************/
 #include "../common/zstd_internal.h"
 #include "zstd_cwksp.h"
+#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */
 
 
 /*-*************************************
@@ -32,7 +34,7 @@
                                        It's not a big deal though : candidate will just be sorted again.
                                        Additionally, candidate position 1 will be lost.
                                        But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
-                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy.
+                                       The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy.
                                        This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */
 
 
@@ -111,12 +113,13 @@ typedef struct {
 /* ZSTD_buildBlockEntropyStats() :
  *  Builds entropy for the block.
  *  @return : 0 on success or error code */
-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr,
-                             const ZSTD_entropyCTables_t* prevEntropy,
-                                   ZSTD_entropyCTables_t* nextEntropy,
-                             const ZSTD_CCtx_params* cctxParams,
-                                   ZSTD_entropyCTablesMetadata_t* entropyMetadata,
-                                   void* workspace, size_t wkspSize);
+size_t ZSTD_buildBlockEntropyStats(
+                    const seqStore_t* seqStorePtr,
+                    const ZSTD_entropyCTables_t* prevEntropy,
+                          ZSTD_entropyCTables_t* nextEntropy,
+                    const ZSTD_CCtx_params* cctxParams,
+                          ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                          void* workspace, size_t wkspSize);
 
 /* *******************************
 *  Compression internals structs *
@@ -142,26 +145,33 @@ typedef struct {
   size_t capacity;      /* The capacity starting from `seq` pointer */
 } rawSeqStore_t;
 
+typedef struct {
+    U32 idx;            /* Index in array of ZSTD_Sequence */
+    U32 posInSequence;  /* Position within sequence at idx */
+    size_t posInSrc;    /* Number of bytes given by sequences provided so far */
+} ZSTD_sequencePosition;
+
 UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0};
 
 typedef struct {
-    int price;
-    U32 off;
-    U32 mlen;
-    U32 litlen;
-    U32 rep[ZSTD_REP_NUM];
+    int price;  /* price from beginning of segment to this position */
+    U32 off;    /* offset of previous match */
+    U32 mlen;   /* length of previous match */
+    U32 litlen; /* nb of literals since previous match */
+    U32 rep[ZSTD_REP_NUM];  /* offset history after previous match */
 } ZSTD_optimal_t;
 
 typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e;
 
+#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3)
 typedef struct {
     /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */
     unsigned* litFreq;           /* table of literals statistics, of size 256 */
     unsigned* litLengthFreq;     /* table of litLength statistics, of size (MaxLL+1) */
     unsigned* matchLengthFreq;   /* table of matchLength statistics, of size (MaxML+1) */
     unsigned* offCodeFreq;       /* table of offCode statistics, of size (MaxOff+1) */
-    ZSTD_match_t* matchTable;    /* list of found matches, of size ZSTD_OPT_NUM+1 */
-    ZSTD_optimal_t* priceTable;  /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */
+    ZSTD_match_t* matchTable;    /* list of found matches, of size ZSTD_OPT_SIZE */
+    ZSTD_optimal_t* priceTable;  /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */
 
     U32  litSum;                 /* nb of literals */
     U32  litLengthSum;           /* nb of litLength codes */
@@ -212,8 +222,10 @@ struct ZSTD_matchState_t {
     U32 hashLog3;           /* dispatch table for matches of len==3 : larger == faster, more memory */
 
     U32 rowHashLog;                          /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/
-    U16* tagTable;                           /* For row-based matchFinder: A row-based table containing the hashes and head index. */
+    BYTE* tagTable;                          /* For row-based matchFinder: A row-based table containing the hashes and head index. */
     U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */
+    U64 hashSalt;                            /* For row-based matchFinder: salts the hash for reuse of tag table */
+    U32 hashSaltEntropy;                     /* For row-based matchFinder: collects entropy for salt generation */
 
     U32* hashTable;
     U32* hashTable3;
@@ -228,6 +240,18 @@ struct ZSTD_matchState_t {
     const ZSTD_matchState_t* dictMatchState;
     ZSTD_compressionParameters cParams;
     const rawSeqStore_t* ldmSeqStore;
+
+    /* Controls prefetching in some dictMatchState matchfinders.
+     * This behavior is controlled from the cctx ms.
+     * This parameter has no effect in the cdict ms. */
+    int prefetchCDictTables;
+
+    /* When == 0, lazy match finders insert every position.
+     * When != 0, lazy match finders only insert positions they search.
+     * This allows them to skip much faster over incompressible data,
+     * at a small cost to compression ratio.
+     */
+    int lazySkipping;
 };
 
 typedef struct {
@@ -324,6 +348,25 @@ struct ZSTD_CCtx_params_s {
 
     /* Internal use, for createCCtxParams() and freeCCtxParams() only */
     ZSTD_customMem customMem;
+
+    /* Controls prefetching in some dictMatchState matchfinders */
+    ZSTD_paramSwitch_e prefetchCDictTables;
+
+    /* Controls whether zstd will fall back to an internal matchfinder
+     * if the external matchfinder returns an error code. */
+    int enableMatchFinderFallback;
+
+    /* Parameters for the external sequence producer API.
+     * Users set these parameters through ZSTD_registerSequenceProducer().
+     * It is not possible to set these parameters individually through the public API. */
+    void* extSeqProdState;
+    ZSTD_sequenceProducer_F extSeqProdFunc;
+
+    /* Adjust the max block size*/
+    size_t maxBlockSize;
+
+    /* Controls repcode search in external sequence parsing */
+    ZSTD_paramSwitch_e searchForExternalRepcodes;
 };  /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
 
 #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2))
@@ -404,6 +447,7 @@ struct ZSTD_CCtx_s {
 
     /* Stable in/out buffer verification */
     ZSTD_inBuffer expectedInBuffer;
+    size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */
     size_t expectedOutBufferSize;
 
     /* Dictionary */
@@ -417,9 +461,14 @@ struct ZSTD_CCtx_s {
 
     /* Workspace for block splitter */
     ZSTD_blockSplitCtx blockSplitCtx;
+
+    /* Buffer for output from external sequence producer */
+    ZSTD_Sequence* extSeqBuf;
+    size_t extSeqBufCapacity;
 };
 
 typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
+typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e;
 
 typedef enum {
     ZSTD_noDict = 0,
@@ -441,7 +490,7 @@ typedef enum {
                                  * In this mode we take both the source size and the dictionary size
                                  * into account when selecting and adjusting the parameters.
                                  */
-    ZSTD_cpm_unknown = 3,       /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams.
+    ZSTD_cpm_unknown = 3        /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams.
                                  * We don't know what these parameters are for. We default to the legacy
                                  * behavior of taking both the source size and the dict size into account
                                  * when selecting and adjusting parameters.
@@ -500,9 +549,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value)
 /* ZSTD_noCompressBlock() :
  * Writes uncompressed block to dst buffer from given src.
  * Returns the size of the block */
-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
+MEM_STATIC size_t
+ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
 {
     U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3);
+    DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity);
     RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity,
                     dstSize_tooSmall, "dst buf too small for uncompressed block");
     MEM_writeLE24(dst, cBlockHeader24);
@@ -510,7 +561,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi
     return ZSTD_blockHeaderSize + srcSize;
 }
 
-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
+MEM_STATIC size_t
+ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
 {
     BYTE* const op = (BYTE*)dst;
     U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3);
@@ -529,7 +581,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
 {
     U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6;
     ZSTD_STATIC_ASSERT(ZSTD_btultra == 8);
-    assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat));
+    assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat));
     return (srcSize >> minlog) + 2;
 }
 
@@ -565,29 +617,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con
     while (ip < iend) *op++ = *ip++;
 }
 
-#define ZSTD_REP_MOVE     (ZSTD_REP_NUM-1)
-#define STORE_REPCODE_1 STORE_REPCODE(1)
-#define STORE_REPCODE_2 STORE_REPCODE(2)
-#define STORE_REPCODE_3 STORE_REPCODE(3)
-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1)
-#define STORE_OFFSET(o)  (assert((o)>0), o + ZSTD_REP_MOVE)
-#define STORED_IS_OFFSET(o)  ((o) > ZSTD_REP_MOVE)
-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE)
-#define STORED_OFFSET(o)  (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE)
-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1)  /* returns ID 1,2,3 */
-#define STORED_TO_OFFBASE(o) ((o)+1)
-#define OFFBASE_TO_STORED(o) ((o)-1)
+
+#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1)
+#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2)
+#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3)
+#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */
+#define OFFSET_TO_OFFBASE(o)  (assert((o)>0), o + ZSTD_REP_NUM)
+#define OFFBASE_IS_OFFSET(o)  ((o) > ZSTD_REP_NUM)
+#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM)
+#define OFFBASE_TO_OFFSET(o)  (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM)
+#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o))  /* returns ID 1,2,3 */
 
 /*! ZSTD_storeSeq() :
- *  Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t.
- *  @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET().
+ *  Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t.
+ *  @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE().
  *  @matchLength : must be >= MINMATCH
- *  Allowed to overread literals up to litLimit.
+ *  Allowed to over-read literals up to litLimit.
 */
 HINT_INLINE UNUSED_ATTR void
 ZSTD_storeSeq(seqStore_t* seqStorePtr,
               size_t litLength, const BYTE* literals, const BYTE* litLimit,
-              U32 offBase_minus1,
+              U32 offBase,
               size_t matchLength)
 {
     BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH;
@@ -596,8 +646,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
     static const BYTE* g_start = NULL;
     if (g_start==NULL) g_start = (const BYTE*)literals;  /* note : index only works for compression within a single segment */
     {   U32 const pos = (U32)((const BYTE*)literals - g_start);
-        DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u",
-               pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1);
+        DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u",
+               pos, (U32)litLength, (U32)matchLength, (U32)offBase);
     }
 #endif
     assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
@@ -607,9 +657,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
     assert(literals + litLength <= litLimit);
     if (litEnd <= litLimit_w) {
         /* Common case we can use wildcopy.
-	 * First copy 16 bytes, because literals are likely short.
-	 */
-        assert(WILDCOPY_OVERLENGTH >= 16);
+         * First copy 16 bytes, because literals are likely short.
+         */
+        ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16);
         ZSTD_copy16(seqStorePtr->lit, literals);
         if (litLength > 16) {
             ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap);
@@ -628,7 +678,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
     seqStorePtr->sequences[0].litLength = (U16)litLength;
 
     /* match offset */
-    seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1);
+    seqStorePtr->sequences[0].offBase = offBase;
 
     /* match Length */
     assert(matchLength >= MINMATCH);
@@ -646,17 +696,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr,
 
 /* ZSTD_updateRep() :
  * updates in-place @rep (array of repeat offsets)
- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq()
+ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq()
  */
 MEM_STATIC void
-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
+ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
 {
-    if (STORED_IS_OFFSET(offBase_minus1)) {  /* full offset */
+    if (OFFBASE_IS_OFFSET(offBase)) {  /* full offset */
         rep[2] = rep[1];
         rep[1] = rep[0];
-        rep[0] = STORED_OFFSET(offBase_minus1);
+        rep[0] = OFFBASE_TO_OFFSET(offBase);
     } else {   /* repcode */
-        U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0;
+        U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0;
         if (repCode > 0) {  /* note : if repCode==0, no change */
             U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
             rep[2] = (repCode >= 2) ? rep[1] : rep[2];
@@ -673,11 +723,11 @@ typedef struct repcodes_s {
 } repcodes_t;
 
 MEM_STATIC repcodes_t
-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0)
+ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0)
 {
     repcodes_t newReps;
     ZSTD_memcpy(&newReps, rep, sizeof(newReps));
-    ZSTD_updateRep(newReps.rep, offBase_minus1, ll0);
+    ZSTD_updateRep(newReps.rep, offBase, ll0);
     return newReps;
 }
 
@@ -685,59 +735,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0
 /*-*************************************
 *  Match length counter
 ***************************************/
-static unsigned ZSTD_NbCommonBytes (size_t val)
-{
-    if (MEM_isLittleEndian()) {
-        if (MEM_64bits()) {
-#       if (__GNUC__ >= 4)
-            return (__builtin_ctzll((U64)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2,
-                                                     0, 3, 1, 3, 1, 4, 2, 7,
-                                                     0, 2, 3, 6, 1, 5, 3, 5,
-                                                     1, 3, 4, 4, 2, 5, 6, 7,
-                                                     7, 0, 1, 2, 3, 3, 4, 6,
-                                                     2, 6, 5, 5, 3, 4, 5, 6,
-                                                     7, 1, 2, 4, 6, 4, 4, 5,
-                                                     7, 2, 6, 5, 7, 6, 7, 7 };
-            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
-#       endif
-        } else { /* 32 bits */
-#       if (__GNUC__ >= 3)
-            return (__builtin_ctz((U32)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0,
-                                                     3, 2, 2, 1, 3, 2, 0, 1,
-                                                     3, 3, 1, 2, 2, 2, 2, 0,
-                                                     3, 1, 2, 0, 1, 0, 1, 1 };
-            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-#       endif
-        }
-    } else {  /* Big Endian CPU */
-        if (MEM_64bits()) {
-#       if (__GNUC__ >= 4)
-            return (__builtin_clzll(val) >> 3);
-#       else
-            unsigned r;
-            const unsigned n32 = sizeof(size_t)*4;   /* calculate this way due to compiler complaining in 32-bits mode */
-            if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
-            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-        } else { /* 32 bits */
-#       if (__GNUC__ >= 3)
-            return (__builtin_clz((U32)val) >> 3);
-#       else
-            unsigned r;
-            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-    }   }
-}
-
-
 MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)
 {
     const BYTE* const pStart = pIn;
@@ -783,32 +780,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
  *  Hashes
  ***************************************/
 static const U32 prime3bytes = 506832829U;
-static U32    ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes)  >> (32-h) ; }
-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
+static U32    ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s)  >> (32-h) ; }
+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */
+MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); }
 
 static const U32 prime4bytes = 2654435761U;
-static U32    ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }
+static U32    ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; }
+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); }
+static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); }
 
 static const U64 prime5bytes = 889523592379ULL;
-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u  << (64-40)) * prime5bytes) >> (64-h)) ; }
-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
+static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); }
 
 static const U64 prime6bytes = 227718039650203ULL;
-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u  << (64-48)) * prime6bytes) >> (64-h)) ; }
-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); }
 
 static const U64 prime7bytes = 58295818150454627ULL;
-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u  << (64-56)) * prime7bytes) >> (64-h)) ; }
-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
+static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u  << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); }
 
 static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes)  ^ s) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); }
+static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); }
+
 
 MEM_STATIC FORCE_INLINE_ATTR
 size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
 {
+    /* Although some of these hashes do support hBits up to 64, some do not.
+     * To be on the safe side, always avoid hBits > 32. */
+    assert(hBits <= 32);
+
     switch(mls)
     {
     default:
@@ -820,6 +828,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
     }
 }
 
+MEM_STATIC FORCE_INLINE_ATTR
+size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) {
+    /* Although some of these hashes do support hBits up to 64, some do not.
+     * To be on the safe side, always avoid hBits > 32. */
+    assert(hBits <= 32);
+
+    switch(mls)
+    {
+        default:
+        case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt);
+        case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt);
+        case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt);
+        case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt);
+        case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt);
+    }
+}
+
+
 /* ZSTD_ipow() :
  * Return base^exponent.
  */
@@ -1011,7 +1037,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,
  * The least significant cycleLog bits of the indices must remain the same,
  * which may be 0. Every index up to maxDist in the past must be valid.
  */
-MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
+MEM_STATIC
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
                                            U32 maxDist, void const* src)
 {
     /* preemptive overflow correction:
@@ -1167,10 +1195,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window,
                     (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
         assert(blockEndIdx >= loadedDictEnd);
 
-        if (blockEndIdx > loadedDictEnd + maxDist) {
+        if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) {
             /* On reaching window size, dictionaries are invalidated.
              * For simplification, if window size is reached anywhere within next block,
              * the dictionary is invalidated for the full block.
+             *
+             * We also have to invalidate the dictionary if ZSTD_window_update() has detected
+             * non-contiguous segments, which means that loadedDictEnd != window->dictLimit.
+             * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use
+             * dictMatchState, so setting it to NULL is not a problem.
              */
             DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)");
             *loadedDictEndPtr = 0;
@@ -1199,7 +1232,9 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) {
  * forget about the extDict. Handles overlap of the prefix and extDict.
  * Returns non-zero if the segment is contiguous.
  */
-MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
+MEM_STATIC
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_window_update(ZSTD_window_t* window,
                                   void const* src, size_t srcSize,
                                   int forceNonContiguous)
 {
@@ -1302,6 +1337,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
 
 #endif
 
+/* Short Cache */
+
+/* Normally, zstd matchfinders follow this flow:
+ *     1. Compute hash at ip
+ *     2. Load index from hashTable[hash]
+ *     3. Check if *ip == *(base + index)
+ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss.
+ *
+ * Short cache is an optimization which allows us to avoid step 3 most of the time
+ * when the data doesn't actually match. With short cache, the flow becomes:
+ *     1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip.
+ *     2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works.
+ *     3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue.
+ *
+ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to
+ * dictMatchState matchfinders.
+ */
+#define ZSTD_SHORT_CACHE_TAG_BITS 8
+#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1)
+
+/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable.
+ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */
+MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) {
+    size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
+    U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK);
+    assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0);
+    hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag;
+}
+
+/* Helper function for short cache matchfinders.
+ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */
+MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) {
+    U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK;
+    U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK;
+    return tag1 == tag2;
+}
 
 
 /* ===============================================================
@@ -1381,11 +1452,10 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity);
  * This cannot be used when long range matching is enabled.
  * Zstd will use these sequences, and pass the literals to a secondary block
  * compressor.
- * @return : An error code on failure.
  * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory
  * access and data corruption.
  */
-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
+void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
 
 /* ZSTD_cycleLog() :
  *  condition for correct operation : hashLog > 1 */
@@ -1396,4 +1466,55 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
  */
 void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize);
 
+/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of
+ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter.
+ * Note that the block delimiter must include the last literals of the block.
+ */
+size_t
+ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx,
+                                              ZSTD_sequencePosition* seqPos,
+                                        const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+                                        const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
+
+/* Returns the number of bytes to move the current read position back by.
+ * Only non-zero if we ended up splitting a sequence.
+ * Otherwise, it may return a ZSTD error if something went wrong.
+ *
+ * This function will attempt to scan through blockSize bytes
+ * represented by the sequences in @inSeqs,
+ * storing any (partial) sequences.
+ *
+ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to
+ * avoid splitting a match, or to avoid splitting a match such that it would produce a match
+ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block.
+ */
+size_t
+ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos,
+                                   const ZSTD_Sequence* const inSeqs, size_t inSeqsSize,
+                                   const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch);
+
+/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */
+MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) {
+    return params->extSeqProdFunc != NULL;
+}
+
+/* ===============================================================
+ * Deprecated definitions that are still used internally to avoid
+ * deprecation warnings. These functions are exactly equivalent to
+ * their public variants, but avoid the deprecation warnings.
+ * =============================================================== */
+
+size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx,
+                                    void* dst, size_t dstCapacity,
+                              const void* src, size_t srcSize);
+
+size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx,
+                               void* dst, size_t dstCapacity,
+                         const void* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
 #endif /* ZSTD_COMPRESS_H */
diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c
index 52b0a8059aba..3e9ea46a670a 100644
--- a/lib/zstd/compress/zstd_compress_literals.c
+++ b/lib/zstd/compress/zstd_compress_literals.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -13,11 +14,36 @@
  ***************************************/
 #include "zstd_compress_literals.h"
 
+
+/* **************************************************************
+*  Debug Traces
+****************************************************************/
+#if DEBUGLEVEL >= 2
+
+static size_t showHexa(const void* src, size_t srcSize)
+{
+    const BYTE* const ip = (const BYTE*)src;
+    size_t u;
+    for (u=0; u<srcSize; u++) {
+        RAWLOG(5, " %02X", ip[u]); (void)ip;
+    }
+    RAWLOG(5, " \n");
+    return srcSize;
+}
+
+#endif
+
+
+/* **************************************************************
+*  Literals compression - special cases
+****************************************************************/
 size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     BYTE* const ostart = (BYTE*)dst;
     U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
 
+    DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity);
+
     RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, "");
 
     switch(flSize)
@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src,
     }
 
     ZSTD_memcpy(ostart + flSize, src, srcSize);
-    DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
+    DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
     return srcSize + flSize;
 }
 
+static int allBytesIdentical(const void* src, size_t srcSize)
+{
+    assert(srcSize >= 1);
+    assert(src != NULL);
+    {   const BYTE b = ((const BYTE*)src)[0];
+        size_t p;
+        for (p=1; p<srcSize; p++) {
+            if (((const BYTE*)src)[p] != b) return 0;
+        }
+        return 1;
+    }
+}
+
 size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
 {
     BYTE* const ostart = (BYTE*)dst;
     U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
 
-    (void)dstCapacity;  /* dstCapacity already guaranteed to be >=4, hence large enough */
+    assert(dstCapacity >= 4); (void)dstCapacity;
+    assert(allBytesIdentical(src, srcSize));
 
     switch(flSize)
     {
@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void*
     }
 
     ostart[flSize] = *(const BYTE*)src;
-    DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1);
+    DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1);
     return flSize+1;
 }
 
-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
-                              ZSTD_hufCTables_t* nextHuf,
-                              ZSTD_strategy strategy, int disableLiteralCompression,
-                              void* dst, size_t dstCapacity,
-                        const void* src, size_t srcSize,
-                              void* entropyWorkspace, size_t entropyWorkspaceSize,
-                        const int bmi2,
-                        unsigned suspectUncompressible)
+/* ZSTD_minLiteralsToCompress() :
+ * returns minimal amount of literals
+ * for literal compression to even be attempted.
+ * Minimum is made tighter as compression strategy increases.
+ */
+static size_t
+ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat)
+{
+    assert((int)strategy >= 0);
+    assert((int)strategy <= 9);
+    /* btultra2 : min 8 bytes;
+     * then 2x larger for each successive compression strategy
+     * max threshold 64 bytes */
+    {   int const shift = MIN(9-(int)strategy, 3);
+        size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift;
+        DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc);
+        return mintc;
+    }
+}
+
+size_t ZSTD_compressLiterals (
+                  void* dst, size_t dstCapacity,
+            const void* src, size_t srcSize,
+                  void* entropyWorkspace, size_t entropyWorkspaceSize,
+            const ZSTD_hufCTables_t* prevHuf,
+                  ZSTD_hufCTables_t* nextHuf,
+                  ZSTD_strategy strategy,
+                  int disableLiteralCompression,
+                  int suspectUncompressible,
+                  int bmi2)
 {
-    size_t const minGain = ZSTD_minGain(srcSize, strategy);
     size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
     BYTE*  const ostart = (BYTE*)dst;
     U32 singleStream = srcSize < 256;
     symbolEncodingType_e hType = set_compressed;
     size_t cLitSize;
 
-    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)",
-                disableLiteralCompression, (U32)srcSize);
+    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)",
+                disableLiteralCompression, (U32)srcSize, dstCapacity);
+
+    DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize));
 
     /* Prepare nextEntropy assuming reusing the existing table */
     ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
     if (disableLiteralCompression)
         return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
 
-    /* small ? don't even attempt compression (speed opt) */
-#   define COMPRESS_LITERALS_SIZE_MIN 63
-    {   size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
-        if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
-    }
+    /* if too small, don't even attempt compression (speed opt) */
+    if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode))
+        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
 
     RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression");
     {   HUF_repeat repeat = prevHuf->repeatMode;
-        int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
+        int const flags = 0
+            | (bmi2 ? HUF_flags_bmi2 : 0)
+            | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0)
+            | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0)
+            | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0);
+
+        typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int);
+        huf_compress_f huf_compress;
         if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
-        cLitSize = singleStream ?
-            HUF_compress1X_repeat(
-                ostart+lhSize, dstCapacity-lhSize, src, srcSize,
-                HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
-                (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) :
-            HUF_compress4X_repeat(
-                ostart+lhSize, dstCapacity-lhSize, src, srcSize,
-                HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
-                (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible);
+        huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat;
+        cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize,
+                                src, srcSize,
+                                HUF_SYMBOLVALUE_MAX, LitHufLog,
+                                entropyWorkspace, entropyWorkspaceSize,
+                                (HUF_CElt*)nextHuf->CTable,
+                                &repeat, flags);
+        DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize);
         if (repeat != HUF_repeat_none) {
             /* reused the existing table */
-            DEBUGLOG(5, "Reusing previous huffman table");
+            DEBUGLOG(5, "reusing statistics from previous huffman block");
             hType = set_repeat;
         }
     }
 
-    if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
-        ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-        return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
-    }
+    {   size_t const minGain = ZSTD_minGain(srcSize, strategy);
+        if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) {
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+            return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+    }   }
     if (cLitSize==1) {
-        ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
-        return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
-    }
+        /* A return value of 1 signals that the alphabet consists of a single symbol.
+         * However, in some rare circumstances, it could be the compressed size (a single byte).
+         * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`.
+         * (it's also necessary to not generate statistics).
+         * Therefore, in such a case, actively check that all bytes are identical. */
+        if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) {
+            ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+            return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
+    }   }
 
     if (hType == set_compressed) {
         /* using a newly constructed table */
@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
     switch(lhSize)
     {
     case 3: /* 2 - 2 - 10 - 10 */
-        {   U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
+        if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
+        {   U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
             MEM_writeLE24(ostart, lhc);
             break;
         }
     case 4: /* 2 - 2 - 14 - 14 */
+        assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
         {   U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18);
             MEM_writeLE32(ostart, lhc);
             break;
         }
     case 5: /* 2 - 2 - 18 - 18 */
+        assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS);
         {   U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22);
             MEM_writeLE32(ostart, lhc);
             ostart[4] = (BYTE)(cLitSize >> 10);
diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h
index 9775fb97cb70..a2a85d6b69e5 100644
--- a/lib/zstd/compress/zstd_compress_literals.h
+++ b/lib/zstd/compress/zstd_compress_literals.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -16,16 +17,24 @@
 
 size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
+/* ZSTD_compressRleLiteralsBlock() :
+ * Conditions :
+ * - All bytes in @src are identical
+ * - dstCapacity >= 4 */
 size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
 
-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */
-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
-                              ZSTD_hufCTables_t* nextHuf,
-                              ZSTD_strategy strategy, int disableLiteralCompression,
-                              void* dst, size_t dstCapacity,
+/* ZSTD_compressLiterals():
+ * @entropyWorkspace: must be aligned on 4-bytes boundaries
+ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE
+ * @suspectUncompressible: sampling checks, to potentially skip huffman coding
+ */
+size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity,
                         const void* src, size_t srcSize,
                               void* entropyWorkspace, size_t entropyWorkspaceSize,
-                        const int bmi2,
-                        unsigned suspectUncompressible);
+                        const ZSTD_hufCTables_t* prevHuf,
+                              ZSTD_hufCTables_t* nextHuf,
+                              ZSTD_strategy strategy, int disableLiteralCompression,
+                              int suspectUncompressible,
+                              int bmi2);
 
 #endif /* ZSTD_COMPRESS_LITERALS_H */
diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c
index 21ddc1b37acf..5c028c78d889 100644
--- a/lib/zstd/compress/zstd_compress_sequences.c
+++ b/lib/zstd/compress/zstd_compress_sequences.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq)
 {
     /* Heuristic: This should cover most blocks <= 16K and
      * start to fade out after 16K to about 32K depending on
-     * comprssibility.
+     * compressibility.
      */
     return nbSeq >= 2048;
 }
@@ -166,7 +167,7 @@ ZSTD_selectEncodingType(
     if (mostFrequent == nbSeq) {
         *repeatMode = FSE_repeat_none;
         if (isDefaultAllowed && nbSeq <= 2) {
-            /* Prefer set_basic over set_rle when there are 2 or less symbols,
+            /* Prefer set_basic over set_rle when there are 2 or fewer symbols,
              * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
              * If basic encoding isn't possible, always choose RLE.
              */
diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h
index 7991364c2f71..7fe6f4ff5cf2 100644
--- a/lib/zstd/compress/zstd_compress_sequences.h
+++ b/lib/zstd/compress/zstd_compress_sequences.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c
index 17d836cc84e8..41f6521b27cd 100644
--- a/lib/zstd/compress/zstd_compress_superblock.c
+++ b/lib/zstd/compress/zstd_compress_superblock.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -36,13 +37,14 @@
  *      If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block
  *      and the following sub-blocks' literals sections will be Treeless_Literals_Block.
  *  @return : compressed size of literals section of a sub-block
- *            Or 0 if it unable to compress.
+ *            Or 0 if unable to compress.
  *            Or error code */
-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
-                                    const ZSTD_hufCTablesMetadata_t* hufMetadata,
-                                    const BYTE* literals, size_t litSize,
-                                    void* dst, size_t dstSize,
-                                    const int bmi2, int writeEntropy, int* entropyWritten)
+static size_t
+ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+                              const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                              const BYTE* literals, size_t litSize,
+                              void* dst, size_t dstSize,
+                              const int bmi2, int writeEntropy, int* entropyWritten)
 {
     size_t const header = writeEntropy ? 200 : 0;
     size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header));
@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
     symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat;
     size_t cLitSize = 0;
 
-    (void)bmi2; /* TODO bmi2... */
-
     DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy);
 
     *entropyWritten = 0;
@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
         DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize);
     }
 
-    /* TODO bmi2 */
-    {   const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable)
-                                          : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable);
+    {   int const flags = bmi2 ? HUF_flags_bmi2 : 0;
+        const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags)
+                                          : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags);
         op += cSize;
         cLitSize += cSize;
         if (cSize == 0 || ERR_isError(cSize)) {
@@ -103,7 +103,7 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
     switch(lhSize)
     {
     case 3: /* 2 - 2 - 10 - 10 */
-        {   U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14);
+        {   U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14);
             MEM_writeLE24(ostart, lhc);
             break;
         }
@@ -123,26 +123,30 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
     }
     *entropyWritten = 1;
     DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart));
-    return op-ostart;
+    return (size_t)(op-ostart);
 }
 
-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) {
-    const seqDef* const sstart = sequences;
-    const seqDef* const send = sequences + nbSeq;
-    const seqDef* sp = sstart;
+static size_t
+ZSTD_seqDecompressedSize(seqStore_t const* seqStore,
+                   const seqDef* sequences, size_t nbSeqs,
+                         size_t litSize, int lastSubBlock)
+{
     size_t matchLengthSum = 0;
     size_t litLengthSum = 0;
-    (void)(litLengthSum); /* suppress unused variable warning on some environments */
-    while (send-sp > 0) {
-        ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp);
+    size_t n;
+    for (n=0; n<nbSeqs; n++) {
+        const ZSTD_sequenceLength seqLen = ZSTD_getSequenceLength(seqStore, sequences+n);
         litLengthSum += seqLen.litLength;
         matchLengthSum += seqLen.matchLength;
-        sp++;
     }
-    assert(litLengthSum <= litSize);
-    if (!lastSequence) {
+    DEBUGLOG(5, "ZSTD_seqDecompressedSize: %u sequences from %p: %u literals + %u matchlength",
+                (unsigned)nbSeqs, (const void*)sequences,
+                (unsigned)litLengthSum, (unsigned)matchLengthSum);
+    if (!lastSubBlock)
         assert(litLengthSum == litSize);
-    }
+    else
+        assert(litLengthSum <= litSize);
+    (void)litLengthSum;
     return matchLengthSum + litSize;
 }
 
@@ -156,13 +160,14 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef*
  *  @return : compressed size of sequences section of a sub-block
  *            Or 0 if it is unable to compress
  *            Or error code. */
-static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
-                                              const ZSTD_fseCTablesMetadata_t* fseMetadata,
-                                              const seqDef* sequences, size_t nbSeq,
-                                              const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
-                                              const ZSTD_CCtx_params* cctxParams,
-                                              void* dst, size_t dstCapacity,
-                                              const int bmi2, int writeEntropy, int* entropyWritten)
+static size_t
+ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
+                                const ZSTD_fseCTablesMetadata_t* fseMetadata,
+                                const seqDef* sequences, size_t nbSeq,
+                                const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+                                const ZSTD_CCtx_params* cctxParams,
+                                void* dst, size_t dstCapacity,
+                                const int bmi2, int writeEntropy, int* entropyWritten)
 {
     const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
     BYTE* const ostart = (BYTE*)dst;
@@ -176,14 +181,14 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables
     /* Sequences Header */
     RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
                     dstSize_tooSmall, "");
-    if (nbSeq < 0x7F)
+    if (nbSeq < 128)
         *op++ = (BYTE)nbSeq;
     else if (nbSeq < LONGNBSEQ)
         op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
     else
         op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
     if (nbSeq==0) {
-        return op - ostart;
+        return (size_t)(op - ostart);
     }
 
     /* seqHead : flags for FSE encoding type */
@@ -205,7 +210,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables
     }
 
     {   size_t const bitstreamSize = ZSTD_encodeSequences(
-                                        op, oend - op,
+                                        op, (size_t)(oend - op),
                                         fseTables->matchlengthCTable, mlCode,
                                         fseTables->offcodeCTable, ofCode,
                                         fseTables->litlengthCTable, llCode,
@@ -249,7 +254,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables
 #endif
 
     *entropyWritten = 1;
-    return op - ostart;
+    return (size_t)(op - ostart);
 }
 
 /* ZSTD_compressSubBlock() :
@@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
                 litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock);
     {   size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable,
                                                         &entropyMetadata->hufMetadata, literals, litSize,
-                                                        op, oend-op, bmi2, writeLitEntropy, litEntropyWritten);
+                                                        op, (size_t)(oend-op),
+                                                        bmi2, writeLitEntropy, litEntropyWritten);
         FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed");
         if (cLitSize == 0) return 0;
         op += cLitSize;
@@ -285,18 +291,18 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
                                                   sequences, nbSeq,
                                                   llCode, mlCode, ofCode,
                                                   cctxParams,
-                                                  op, oend-op,
+                                                  op, (size_t)(oend-op),
                                                   bmi2, writeSeqEntropy, seqEntropyWritten);
         FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed");
         if (cSeqSize == 0) return 0;
         op += cSeqSize;
     }
     /* Write block header */
-    {   size_t cSize = (op-ostart)-ZSTD_blockHeaderSize;
+    {   size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize;
         U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
         MEM_writeLE24(ostart, cBlockHeader24);
     }
-    return op-ostart;
+    return (size_t)(op-ostart);
 }
 
 static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize,
@@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable,
     return cSeqSizeEstimate + sequencesSectionHeaderSize;
 }
 
-static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
+typedef struct {
+    size_t estLitSize;
+    size_t estBlockSize;
+} EstimatedBlockSize;
+static EstimatedBlockSize ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
                                         const BYTE* ofCodeTable,
                                         const BYTE* llCodeTable,
                                         const BYTE* mlCodeTable,
@@ -393,15 +403,17 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
                                         const ZSTD_entropyCTables_t* entropy,
                                         const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
                                         void* workspace, size_t wkspSize,
-                                        int writeLitEntropy, int writeSeqEntropy) {
-    size_t cSizeEstimate = 0;
-    cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize,
-                                                         &entropy->huf, &entropyMetadata->hufMetadata,
-                                                         workspace, wkspSize, writeLitEntropy);
-    cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
+                                        int writeLitEntropy, int writeSeqEntropy)
+{
+    EstimatedBlockSize ebs;
+    ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize,
+                                                        &entropy->huf, &entropyMetadata->hufMetadata,
+                                                        workspace, wkspSize, writeLitEntropy);
+    ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
                                                          nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
                                                          workspace, wkspSize, writeSeqEntropy);
-    return cSizeEstimate + ZSTD_blockHeaderSize;
+    ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize;
+    return ebs;
 }
 
 static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata)
@@ -415,13 +427,56 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe
     return 0;
 }
 
+static size_t countLiterals(seqStore_t const* seqStore, const seqDef* sp, size_t seqCount)
+{
+    size_t n, total = 0;
+    assert(sp != NULL);
+    for (n=0; n<seqCount; n++) {
+        total += ZSTD_getSequenceLength(seqStore, sp+n).litLength;
+    }
+    DEBUGLOG(6, "countLiterals for %zu sequences from %p => %zu bytes", seqCount, (const void*)sp, total);
+    return total;
+}
+
+#define BYTESCALE 256
+
+static size_t sizeBlockSequences(const seqDef* sp, size_t nbSeqs,
+                size_t targetBudget, size_t avgLitCost, size_t avgSeqCost,
+                int firstSubBlock)
+{
+    size_t n, budget = 0, inSize=0;
+    /* entropy headers */
+    size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */
+    assert(firstSubBlock==0 || firstSubBlock==1);
+    budget += headerSize;
+
+    /* first sequence => at least one sequence*/
+    budget += sp[0].litLength * avgLitCost + avgSeqCost;
+    if (budget > targetBudget) return 1;
+    inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH);
+
+    /* loop over sequences */
+    for (n=1; n<nbSeqs; n++) {
+        size_t currentCost = sp[n].litLength * avgLitCost + avgSeqCost;
+        budget += currentCost;
+        inSize += sp[n].litLength + (sp[n].mlBase+MINMATCH);
+        /* stop when sub-block budget is reached */
+        if ( (budget > targetBudget)
+            /* though continue to expand until the sub-block is deemed compressible */
+          && (budget < inSize * BYTESCALE) )
+            break;
+    }
+
+    return n;
+}
+
 /* ZSTD_compressSubBlock_multi() :
  *  Breaks super-block into multiple sub-blocks and compresses them.
- *  Entropy will be written to the first block.
- *  The following blocks will use repeat mode to compress.
- *  All sub-blocks are compressed blocks (no raw or rle blocks).
- *  @return : compressed size of the super block (which is multiple ZSTD blocks)
- *            Or 0 if it failed to compress. */
+ *  Entropy will be written into the first block.
+ *  The following blocks use repeat_mode to compress.
+ *  Sub-blocks are all compressed, except the last one when beneficial.
+ *  @return : compressed size of the super block (which features multiple ZSTD blocks)
+ *            or 0 if it failed to compress. */
 static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
                             const ZSTD_compressedBlockState_t* prevCBlock,
                             ZSTD_compressedBlockState_t* nextCBlock,
@@ -434,10 +489,12 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
 {
     const seqDef* const sstart = seqStorePtr->sequencesStart;
     const seqDef* const send = seqStorePtr->sequences;
-    const seqDef* sp = sstart;
+    const seqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */
+    size_t const nbSeqs = (size_t)(send - sstart);
     const BYTE* const lstart = seqStorePtr->litStart;
     const BYTE* const lend = seqStorePtr->lit;
     const BYTE* lp = lstart;
+    size_t const nbLiterals = (size_t)(lend - lstart);
     BYTE const* ip = (BYTE const*)src;
     BYTE const* const iend = ip + srcSize;
     BYTE* const ostart = (BYTE*)dst;
@@ -446,112 +503,171 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
     const BYTE* llCodePtr = seqStorePtr->llCode;
     const BYTE* mlCodePtr = seqStorePtr->mlCode;
     const BYTE* ofCodePtr = seqStorePtr->ofCode;
-    size_t targetCBlockSize = cctxParams->targetCBlockSize;
-    size_t litSize, seqCount;
-    int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed;
+    size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */
+    size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize);
+    int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed);
     int writeSeqEntropy = 1;
-    int lastSequence = 0;
-
-    DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
-                (unsigned)(lend-lp), (unsigned)(send-sstart));
-
-    litSize = 0;
-    seqCount = 0;
-    do {
-        size_t cBlockSizeEstimate = 0;
-        if (sstart == send) {
-            lastSequence = 1;
-        } else {
-            const seqDef* const sequence = sp + seqCount;
-            lastSequence = sequence == send - 1;
-            litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength;
-            seqCount++;
-        }
-        if (lastSequence) {
-            assert(lp <= lend);
-            assert(litSize <= (size_t)(lend - lp));
-            litSize = (size_t)(lend - lp);
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)",
+               (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart));
+
+        /* let's start by a general estimation for the full block */
+    if (nbSeqs > 0) {
+        EstimatedBlockSize const ebs =
+                ZSTD_estimateSubBlockSize(lp, nbLiterals,
+                                        ofCodePtr, llCodePtr, mlCodePtr, nbSeqs,
+                                        &nextCBlock->entropy, entropyMetadata,
+                                        workspace, wkspSize,
+                                        writeLitEntropy, writeSeqEntropy);
+        /* quick estimation */
+        size_t const avgLitCost = nbLiterals ? (ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE;
+        size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs;
+        const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1);
+        size_t n, avgBlockBudget, blockBudgetSupp=0;
+        avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks;
+        DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes",
+                    (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE,
+                    (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE);
+        /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately
+         * this will result in the production of a single uncompressed block covering @srcSize.*/
+        if (ebs.estBlockSize > srcSize) return 0;
+
+        /* compress and write sub-blocks */
+        assert(nbSubBlocks>0);
+        for (n=0; n < nbSubBlocks-1; n++) {
+            /* determine nb of sequences for current sub-block + nbLiterals from next sequence */
+            size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp),
+                                        avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0);
+            /* if reached last sequence : break to last sub-block (simplification) */
+            assert(seqCount <= (size_t)(send-sp));
+            if (sp + seqCount == send) break;
+            assert(seqCount > 0);
+            /* compress sub-block */
+            {   int litEntropyWritten = 0;
+                int seqEntropyWritten = 0;
+                size_t litSize = countLiterals(seqStorePtr, sp, seqCount);
+                const size_t decompressedSize =
+                        ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0);
+                size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
+                                                sp, seqCount,
+                                                lp, litSize,
+                                                llCodePtr, mlCodePtr, ofCodePtr,
+                                                cctxParams,
+                                                op, (size_t)(oend-op),
+                                                bmi2, writeLitEntropy, writeSeqEntropy,
+                                                &litEntropyWritten, &seqEntropyWritten,
+                                                0);
+                FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
+
+                /* check compressibility, update state components */
+                if (cSize > 0 && cSize < decompressedSize) {
+                    DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes",
+                                (unsigned)decompressedSize, (unsigned)cSize);
+                    assert(ip + decompressedSize <= iend);
+                    ip += decompressedSize;
+                    lp += litSize;
+                    op += cSize;
+                    llCodePtr += seqCount;
+                    mlCodePtr += seqCount;
+                    ofCodePtr += seqCount;
+                    /* Entropy only needs to be written once */
+                    if (litEntropyWritten) {
+                        writeLitEntropy = 0;
+                    }
+                    if (seqEntropyWritten) {
+                        writeSeqEntropy = 0;
+                    }
+                    sp += seqCount;
+                    blockBudgetSupp = 0;
+            }   }
+            /* otherwise : do not compress yet, coalesce current sub-block with following one */
         }
-        /* I think there is an optimization opportunity here.
-         * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful
-         * since it recalculates estimate from scratch.
-         * For example, it would recount literal distribution and symbol codes every time.
-         */
-        cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount,
-                                                       &nextCBlock->entropy, entropyMetadata,
-                                                       workspace, wkspSize, writeLitEntropy, writeSeqEntropy);
-        if (cBlockSizeEstimate > targetCBlockSize || lastSequence) {
-            int litEntropyWritten = 0;
-            int seqEntropyWritten = 0;
-            const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence);
-            const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
-                                                       sp, seqCount,
-                                                       lp, litSize,
-                                                       llCodePtr, mlCodePtr, ofCodePtr,
-                                                       cctxParams,
-                                                       op, oend-op,
-                                                       bmi2, writeLitEntropy, writeSeqEntropy,
-                                                       &litEntropyWritten, &seqEntropyWritten,
-                                                       lastBlock && lastSequence);
-            FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
-            if (cSize > 0 && cSize < decompressedSize) {
-                DEBUGLOG(5, "Committed the sub-block");
-                assert(ip + decompressedSize <= iend);
-                ip += decompressedSize;
-                sp += seqCount;
-                lp += litSize;
-                op += cSize;
-                llCodePtr += seqCount;
-                mlCodePtr += seqCount;
-                ofCodePtr += seqCount;
-                litSize = 0;
-                seqCount = 0;
-                /* Entropy only needs to be written once */
-                if (litEntropyWritten) {
-                    writeLitEntropy = 0;
-                }
-                if (seqEntropyWritten) {
-                    writeSeqEntropy = 0;
-                }
+    } /* if (nbSeqs > 0) */
+
+    /* write last block */
+    DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp));
+    {   int litEntropyWritten = 0;
+        int seqEntropyWritten = 0;
+        size_t litSize = (size_t)(lend - lp);
+        size_t seqCount = (size_t)(send - sp);
+        const size_t decompressedSize =
+                ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1);
+        size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
+                                            sp, seqCount,
+                                            lp, litSize,
+                                            llCodePtr, mlCodePtr, ofCodePtr,
+                                            cctxParams,
+                                            op, (size_t)(oend-op),
+                                            bmi2, writeLitEntropy, writeSeqEntropy,
+                                            &litEntropyWritten, &seqEntropyWritten,
+                                            lastBlock);
+        FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
+
+        /* update pointers, the nb of literals borrowed from next sequence must be preserved */
+        if (cSize > 0 && cSize < decompressedSize) {
+            DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes",
+                        (unsigned)decompressedSize, (unsigned)cSize);
+            assert(ip + decompressedSize <= iend);
+            ip += decompressedSize;
+            lp += litSize;
+            op += cSize;
+            llCodePtr += seqCount;
+            mlCodePtr += seqCount;
+            ofCodePtr += seqCount;
+            /* Entropy only needs to be written once */
+            if (litEntropyWritten) {
+                writeLitEntropy = 0;
             }
+            if (seqEntropyWritten) {
+                writeSeqEntropy = 0;
+            }
+            sp += seqCount;
         }
-    } while (!lastSequence);
+    }
+
+
     if (writeLitEntropy) {
-        DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten");
+        DEBUGLOG(5, "Literal entropy tables were never written");
         ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf));
     }
     if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) {
         /* If we haven't written our entropy tables, then we've violated our contract and
          * must emit an uncompressed block.
          */
-        DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten");
+        DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block");
         return 0;
     }
+
     if (ip < iend) {
-        size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock);
-        DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip));
+        /* some data left : last part of the block sent uncompressed */
+        size_t const rSize = (size_t)((iend - ip));
+        size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock);
+        DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize));
         FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
         assert(cSize != 0);
         op += cSize;
         /* We have to regenerate the repcodes because we've skipped some sequences */
         if (sp < send) {
-            seqDef const* seq;
+            const seqDef* seq;
             repcodes_t rep;
             ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep));
             for (seq = sstart; seq < sp; ++seq) {
-                ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
+                ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
             }
             ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep));
         }
     }
-    DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed");
-    return op-ostart;
+
+    DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u",
+                (unsigned)(op-ostart));
+    return (size_t)(op-ostart);
 }
 
 size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
                                void* dst, size_t dstCapacity,
-                               void const* src, size_t srcSize,
-                               unsigned lastBlock) {
+                               const void* src, size_t srcSize,
+                               unsigned lastBlock)
+{
     ZSTD_entropyCTablesMetadata_t entropyMetadata;
 
     FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore,
diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h
index 224ece79546e..826bbc9e029b 100644
--- a/lib/zstd/compress/zstd_compress_superblock.h
+++ b/lib/zstd/compress/zstd_compress_superblock.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h
index 349fc923c355..86bc3c2c23c7 100644
--- a/lib/zstd/compress/zstd_cwksp.h
+++ b/lib/zstd/compress/zstd_cwksp.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,7 +15,9 @@
 /*-*************************************
 *  Dependencies
 ***************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customFree */
 #include "../common/zstd_internal.h"
+#include "../common/portability_macros.h"
 
 
 /*-*************************************
@@ -41,8 +44,9 @@
 ***************************************/
 typedef enum {
     ZSTD_cwksp_alloc_objects,
-    ZSTD_cwksp_alloc_buffers,
-    ZSTD_cwksp_alloc_aligned
+    ZSTD_cwksp_alloc_aligned_init_once,
+    ZSTD_cwksp_alloc_aligned,
+    ZSTD_cwksp_alloc_buffers
 } ZSTD_cwksp_alloc_phase_e;
 
 /*
@@ -95,8 +99,8 @@ typedef enum {
  *
  * Workspace Layout:
  *
- * [                        ... workspace ...                         ]
- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers]
+ * [                        ... workspace ...                           ]
+ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once]
  *
  * The various objects that live in the workspace are divided into the
  * following categories, and are allocated separately:
@@ -120,9 +124,18 @@ typedef enum {
  *   uint32_t arrays, all of whose values are between 0 and (nextSrc - base).
  *   Their sizes depend on the cparams. These tables are 64-byte aligned.
  *
- * - Aligned: these buffers are used for various purposes that require 4 byte
- *   alignment, but don't require any initialization before they're used. These
- *   buffers are each aligned to 64 bytes.
+ * - Init once: these buffers require to be initialized at least once before
+ *   use. They should be used when we want to skip memory initialization
+ *   while not triggering memory checkers (like Valgrind) when reading from
+ *   from this memory without writing to it first.
+ *   These buffers should be used carefully as they might contain data
+ *   from previous compressions.
+ *   Buffers are aligned to 64 bytes.
+ *
+ * - Aligned: these buffers don't require any initialization before they're
+ *   used. The user of the buffer should make sure they write into a buffer
+ *   location before reading from it.
+ *   Buffers are aligned to 64 bytes.
  *
  * - Buffers: these buffers are used for various purposes that don't require
  *   any alignment or initialization before they're used. This means they can
@@ -134,8 +147,9 @@ typedef enum {
  * correctly packed into the workspace buffer. That order is:
  *
  * 1. Objects
- * 2. Buffers
- * 3. Aligned/Tables
+ * 2. Init once / Tables
+ * 3. Aligned / Tables
+ * 4. Buffers / Tables
  *
  * Attempts to reserve objects of different types out of order will fail.
  */
@@ -147,6 +161,7 @@ typedef struct {
     void* tableEnd;
     void* tableValidEnd;
     void* allocStart;
+    void* initOnceStart;
 
     BYTE allocFailed;
     int workspaceOversizedDuration;
@@ -159,6 +174,7 @@ typedef struct {
 ***************************************/
 
 MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws);
+MEM_STATIC void*  ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws);
 
 MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
     (void)ws;
@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
     assert(ws->tableEnd <= ws->allocStart);
     assert(ws->tableValidEnd <= ws->allocStart);
     assert(ws->allocStart <= ws->workspaceEnd);
+    assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws));
+    assert(ws->workspace <= ws->initOnceStart);
 }
 
 /*
@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) {
  * for internal purposes (currently only alignment).
  */
 MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) {
-    /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes
-     * to align the beginning of tables section, as well as another n_2=[0, 63] bytes
-     * to align the beginning of the aligned section.
-     *
-     * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and
-     * aligneds being sized in multiples of 64 bytes.
+    /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES
+     * bytes to align the beginning of tables section and end of buffers;
      */
-    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES;
+    size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2;
     return slackSpace;
 }
 
@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt
     size_t const alignBytesMask = alignBytes - 1;
     size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask;
     assert((alignBytes & alignBytesMask) == 0);
-    assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES);
+    assert(bytes < alignBytes);
     return bytes;
 }
 
+/*
+ * Returns the initial value for allocStart which is used to determine the position from
+ * which we can allocate from the end of the workspace.
+ */
+MEM_STATIC void*  ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) {
+    return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1));
+}
+
 /*
  * Internal function. Do not use directly.
  * Reserves the given number of bytes within the aligned/buffer segment of the wksp,
@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
 {
     assert(phase >= ws->phase);
     if (phase > ws->phase) {
-        /* Going from allocating objects to allocating buffers */
-        if (ws->phase < ZSTD_cwksp_alloc_buffers &&
-                phase >= ZSTD_cwksp_alloc_buffers) {
+        /* Going from allocating objects to allocating initOnce / tables */
+        if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once &&
+            phase >= ZSTD_cwksp_alloc_aligned_init_once) {
             ws->tableValidEnd = ws->objectEnd;
-        }
+            ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);
 
-        /* Going from allocating buffers to allocating aligneds/tables */
-        if (ws->phase < ZSTD_cwksp_alloc_aligned &&
-                phase >= ZSTD_cwksp_alloc_aligned) {
-            {   /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */
-                size_t const bytesToAlign =
-                    ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES);
-                DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign);
-                ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */
-                RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign),
-                                memory_allocation, "aligned phase - alignment initial allocation failed!");
-            }
             {   /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */
-                void* const alloc = ws->objectEnd;
+                void *const alloc = ws->objectEnd;
                 size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES);
-                void* const objectEnd = (BYTE*)alloc + bytesToAlign;
+                void *const objectEnd = (BYTE *) alloc + bytesToAlign;
                 DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign);
                 RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation,
                                 "table phase - alignment initial allocation failed!");
@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
                 ws->tableEnd = objectEnd;  /* table area starts being empty */
                 if (ws->tableValidEnd < ws->tableEnd) {
                     ws->tableValidEnd = ws->tableEnd;
-        }   }   }
+                }
+            }
+        }
         ws->phase = phase;
         ZSTD_cwksp_assert_internal_consistency(ws);
     }
@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase
  */
 MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr)
 {
-    return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd);
+    return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd);
 }
 
 /*
@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes)
     return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers);
 }
 
+/*
+ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
+ * This memory has been initialized at least once in the past.
+ * This doesn't mean it has been initialized this time, and it might contain data from previous
+ * operations.
+ * The main usage is for algorithms that might need read access into uninitialized memory.
+ * The algorithm must maintain safety under these conditions and must make sure it doesn't
+ * leak any of the past data (directly or in side channels).
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes)
+{
+    size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES);
+    void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once);
+    assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0);
+    if(ptr && ptr < ws->initOnceStart) {
+        /* We assume the memory following the current allocation is either:
+         * 1. Not usable as initOnce memory (end of workspace)
+         * 2. Another initOnce buffer that has been allocated before (and so was previously memset)
+         * 3. An ASAN redzone, in which case we don't want to write on it
+         * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart.
+         * Note that we assume here that MSAN and ASAN cannot run in the same time. */
+        ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes));
+        ws->initOnceStart = ptr;
+    }
+    return ptr;
+}
+
 /*
  * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes).
  */
@@ -356,18 +396,22 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes)
 
 /*
  * Aligned on 64 bytes. These buffers have the special property that
- * their values remain constrained, allowing us to re-use them without
+ * their values remain constrained, allowing us to reuse them without
  * memset()-ing them.
  */
 MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes)
 {
-    const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned;
+    const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once;
     void* alloc;
     void* end;
     void* top;
 
-    if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
-        return NULL;
+    /* We can only start allocating tables after we are done reserving space for objects at the
+     * start of the workspace */
+    if(ws->phase < phase) {
+        if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) {
+            return NULL;
+        }
     }
     alloc = ws->tableEnd;
     end = (BYTE *)alloc + bytes;
@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) {
     assert(ws->tableValidEnd >= ws->objectEnd);
     assert(ws->tableValidEnd <= ws->allocStart);
     if (ws->tableValidEnd < ws->tableEnd) {
-        ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd);
+        ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd));
     }
     ZSTD_cwksp_mark_tables_clean(ws);
 }
@@ -478,14 +522,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {
 
 
     ws->tableEnd = ws->objectEnd;
-    ws->allocStart = ws->workspaceEnd;
+    ws->allocStart = ZSTD_cwksp_initialAllocStart(ws);
     ws->allocFailed = 0;
-    if (ws->phase > ZSTD_cwksp_alloc_buffers) {
-        ws->phase = ZSTD_cwksp_alloc_buffers;
+    if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) {
+        ws->phase = ZSTD_cwksp_alloc_aligned_init_once;
     }
     ZSTD_cwksp_assert_internal_consistency(ws);
 }
 
+MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) {
+    return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace);
+}
+
+MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) {
+    return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace)
+         + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart);
+}
+
 /*
  * The provided workspace takes ownership of the buffer [start, start+size).
  * Any existing values in the workspace are ignored (the previously managed
@@ -498,6 +551,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c
     ws->workspaceEnd = (BYTE*)start + size;
     ws->objectEnd = ws->workspace;
     ws->tableValidEnd = ws->objectEnd;
+    ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws);
     ws->phase = ZSTD_cwksp_alloc_objects;
     ws->isStatic = isStatic;
     ZSTD_cwksp_clear(ws);
@@ -529,15 +583,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) {
     ZSTD_memset(src, 0, sizeof(ZSTD_cwksp));
 }
 
-MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) {
-    return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace);
-}
-
-MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) {
-    return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace)
-         + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart);
-}
-
 MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
     return ws->allocFailed;
 }
@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
  * Returns if the estimated space needed for a wksp is within an acceptable limit of the
  * actual amount of space used.
  */
-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws,
-                                                        size_t const estimatedSpace, int resizedWorkspace) {
-    if (resizedWorkspace) {
-        /* Resized/newly allocated wksp should have exact bounds */
-        return ZSTD_cwksp_used(ws) == estimatedSpace;
-    } else {
-        /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes
-         * than estimatedSpace. See the comments in zstd_cwksp.h for details.
-         */
-        return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63);
-    }
+MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) {
+    /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice
+     * the alignment bytes difference between estimation and actual usage */
+    return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) &&
+           ZSTD_cwksp_used(ws) <= estimatedSpace;
 }
 
 
diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c
index 76933dea2624..5ff54f17d92f 100644
--- a/lib/zstd/compress/zstd_double_fast.c
+++ b/lib/zstd/compress/zstd_double_fast.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,8 +12,49 @@
 #include "zstd_compress_internal.h"
 #include "zstd_double_fast.h"
 
+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
 
-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms,
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashLarge = ms->hashTable;
+    U32  const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    U32  const mls = cParams->minMatch;
+    U32* const hashSmall = ms->chainTable;
+    U32  const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
+
+    /* Always insert every fastHashFillStep position into the hash tables.
+     * Insert the other positions into the large hash table if their entry
+     * is empty.
+     */
+    for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        U32 i;
+        for (i = 0; i < fastHashFillStep; ++i) {
+            size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls);
+            size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8);
+            if (i == 0) {
+                ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i);
+            }
+            if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {
+                ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i);
+            }
+            /* Only load extra positions for ZSTD_dtlm_full */
+            if (dtlm == ZSTD_dtlm_fast)
+                break;
+    }   }
+}
+
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms,
                               void const* end, ZSTD_dictTableLoadMethod_e dtlm)
 {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -43,11 +85,24 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
             /* Only load extra positions for ZSTD_dtlm_full */
             if (dtlm == ZSTD_dtlm_fast)
                 break;
-    }   }
+        }   }
+}
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp)
+{
+    if (tfp == ZSTD_tfp_forCDict) {
+        ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm);
+    } else {
+        ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm);
+    }
 }
 
 
 FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
 size_t ZSTD_compressBlock_doubleFast_noDict_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize, U32 const mls /* template */)
@@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
     U32 offset_1=rep[0], offset_2=rep[1];
-    U32 offsetSaved = 0;
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
 
     size_t mLength;
     U32 offset;
@@ -100,8 +155,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
         U32 const current = (U32)(ip - base);
         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
         U32 const maxRep = current - windowLow;
-        if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
-        if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+        if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
     }
 
     /* Outer Loop: one iteration per match found and stored */
@@ -131,7 +186,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
             if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) {
                 mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
                 ip++;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
                 goto _match_stored;
             }
 
@@ -175,9 +230,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
         } while (ip1 <= ilimit);
 
 _cleanup:
+        /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+         * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+        offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
         /* save reps for next block */
-        rep[0] = offset_1 ? offset_1 : offsetSaved;
-        rep[1] = offset_2 ? offset_2 : offsetSaved;
+        rep[0] = offset_1 ? offset_1 : offsetSaved1;
+        rep[1] = offset_2 ? offset_2 : offsetSaved2;
 
         /* Return the last literals size */
         return (size_t)(iend - anchor);
@@ -217,7 +276,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
             hashLong[hl1] = (U32)(ip1 - base);
         }
 
-        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
 
 _match_stored:
         /* match found */
@@ -243,7 +302,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
                 U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff;  /* swap offset_2 <=> offset_1 */
                 hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
                 hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
-                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength);
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
                 ip += rLength;
                 anchor = ip;
                 continue;   /* faster when present ... (?) */
@@ -254,6 +313,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic(
 
 
 FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
 size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize,
@@ -275,7 +335,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
     U32 offset_1=rep[0], offset_2=rep[1];
-    U32 offsetSaved = 0;
 
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
     const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
@@ -286,8 +345,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     const BYTE* const dictStart    = dictBase + dictStartIndex;
     const BYTE* const dictEnd      = dms->window.nextSrc;
     const U32 dictIndexDelta       = prefixLowestIndex - (U32)(dictEnd - dictBase);
-    const U32 dictHBitsL           = dictCParams->hashLog;
-    const U32 dictHBitsS           = dictCParams->chainLog;
+    const U32 dictHBitsL           = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    const U32 dictHBitsS           = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS;
     const U32 dictAndPrefixLength  = (U32)((ip - prefixLowest) + (dictEnd - dictStart));
 
     DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic");
@@ -295,6 +354,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     /* if a dictionary is attached, it must be within window range */
     assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);
 
+    if (ms->prefetchCDictTables) {
+        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
+        size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32);
+        PREFETCH_AREA(dictHashLong, hashTableBytes);
+        PREFETCH_AREA(dictHashSmall, chainTableBytes);
+    }
+
     /* init */
     ip += (dictAndPrefixLength == 0);
 
@@ -309,8 +375,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         U32 offset;
         size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
         size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
-        size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8);
-        size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+        size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8);
+        size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+        U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS];
+        U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS];
+        int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL);
+        int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS);
         U32 const curr = (U32)(ip-base);
         U32 const matchIndexL = hashLong[h2];
         U32 matchIndexS = hashSmall[h];
@@ -328,7 +398,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
             const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
             mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
             ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
             goto _match_stored;
         }
 
@@ -340,9 +410,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                 while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
                 goto _match_found;
             }
-        } else {
+        } else if (dictTagsMatchL) {
             /* check dictMatchState long match */
-            U32 const dictMatchIndexL = dictHashLong[dictHL];
+            U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS;
             const BYTE* dictMatchL = dictBase + dictMatchIndexL;
             assert(dictMatchL < dictEnd);
 
@@ -358,9 +428,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
             if (MEM_read32(match) == MEM_read32(ip)) {
                 goto _search_next_long;
             }
-        } else {
+        } else if (dictTagsMatchS) {
             /* check dictMatchState short match */
-            U32 const dictMatchIndexS = dictHashSmall[dictHS];
+            U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS;
             match = dictBase + dictMatchIndexS;
             matchIndexS = dictMatchIndexS + dictIndexDelta;
 
@@ -375,10 +445,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         continue;
 
 _search_next_long:
-
         {   size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
-            size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
+            size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
             U32 const matchIndexL3 = hashLong[hl3];
+            U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS];
+            int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3);
             const BYTE* matchL3 = base + matchIndexL3;
             hashLong[hl3] = curr + 1;
 
@@ -391,9 +462,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                     while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
                     goto _match_found;
                 }
-            } else {
+            } else if (dictTagsMatchL3) {
                 /* check dict long +1 match */
-                U32 const dictMatchIndexL3 = dictHashLong[dictHLNext];
+                U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS;
                 const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
                 assert(dictMatchL3 < dictEnd);
                 if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
@@ -419,7 +490,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
         offset_2 = offset_1;
         offset_1 = offset;
 
-        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+        ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
 
 _match_stored:
         /* match found */
@@ -448,7 +519,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
                     const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
                     size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
                     U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                     hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
                     hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
                     ip += repLength2;
@@ -461,8 +532,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic(
     }   /* while (ip < ilimit) */
 
     /* save reps for next block */
-    rep[0] = offset_1 ? offset_1 : offsetSaved;
-    rep[1] = offset_2 ? offset_2 : offsetSaved;
+    rep[0] = offset_1;
+    rep[1] = offset_2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
@@ -527,7 +598,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState(
 }
 
 
-static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_compressBlock_doubleFast_extDict_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize,
         U32 const mls /* template */)
@@ -585,7 +658,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
             const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
             mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
             ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
+            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
         } else {
             if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
                 const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
@@ -596,7 +669,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                 while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; }   /* catch up */
                 offset_2 = offset_1;
                 offset_1 = offset;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
 
             } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {
                 size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
@@ -621,7 +694,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                 }
                 offset_2 = offset_1;
                 offset_1 = offset;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
 
             } else {
                 ip += ((ip-anchor) >> kSearchStrength) + 1;
@@ -653,7 +726,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
                     const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                     size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                     U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                     hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
                     hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
                     ip += repLength2;
@@ -694,3 +767,5 @@ size_t ZSTD_compressBlock_doubleFast_extDict(
         return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize);
     }
 }
+
+#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */
diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h
index 6822bde65a1d..b7ddc714f13e 100644
--- a/lib/zstd/compress/zstd_double_fast.h
+++ b/lib/zstd/compress/zstd_double_fast.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -15,8 +16,12 @@
 #include "../common/mem.h"      /* U32 */
 #include "zstd_compress_internal.h"     /* ZSTD_CCtx, size_t */
 
+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
+
 void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
-                              void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+                              void const* end, ZSTD_dictTableLoadMethod_e dtlm,
+                              ZSTD_tableFillPurpose_e tfp);
+
 size_t ZSTD_compressBlock_doubleFast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
@@ -27,6 +32,14 @@ size_t ZSTD_compressBlock_doubleFast_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
 
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict
+#else
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL
+#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */
 
 
 #endif /* ZSTD_DOUBLE_FAST_H */
diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c
index a752e6beab52..b7a63ba4ce56 100644
--- a/lib/zstd/compress/zstd_fast.c
+++ b/lib/zstd/compress/zstd_fast.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -11,8 +12,46 @@
 #include "zstd_compress_internal.h"  /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
 #include "zstd_fast.h"
 
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm)
+{
+    const ZSTD_compressionParameters* const cParams = &ms->cParams;
+    U32* const hashTable = ms->hashTable;
+    U32  const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
+    U32  const mls = cParams->minMatch;
+    const BYTE* const base = ms->window.base;
+    const BYTE* ip = base + ms->nextToUpdate;
+    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+    const U32 fastHashFillStep = 3;
 
-void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+    /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
+     * Feel free to remove this assert if there's a good reason! */
+    assert(dtlm == ZSTD_dtlm_full);
+
+    /* Always insert every fastHashFillStep position into the hash table.
+     * Insert the other positions if their hash entry is empty.
+     */
+    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
+        U32 const curr = (U32)(ip - base);
+        {   size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
+            ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);   }
+
+        if (dtlm == ZSTD_dtlm_fast) continue;
+        /* Only load extra positions for ZSTD_dtlm_full */
+        {   U32 p;
+            for (p = 1; p < fastHashFillStep; ++p) {
+                size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
+                if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {  /* not yet filled */
+                    ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
+                }   }   }   }
+}
+
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
                         const void* const end,
                         ZSTD_dictTableLoadMethod_e dtlm)
 {
@@ -25,6 +64,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
     const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
     const U32 fastHashFillStep = 3;
 
+    /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
+     * Feel free to remove this assert if there's a good reason! */
+    assert(dtlm == ZSTD_dtlm_fast);
+
     /* Always insert every fastHashFillStep position into the hash table.
      * Insert the other positions if their hash entry is empty.
      */
@@ -42,6 +85,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
     }   }   }   }
 }
 
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+                        const void* const end,
+                        ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp)
+{
+    if (tfp == ZSTD_tfp_forCDict) {
+        ZSTD_fillHashTableForCDict(ms, end, dtlm);
+    } else {
+        ZSTD_fillHashTableForCCtx(ms, end, dtlm);
+    }
+}
+
 
 /*
  * If you squint hard enough (and ignore repcodes), the search operation at any
@@ -89,8 +144,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
  *
  * This is also the work we do at the beginning to enter the loop initially.
  */
-FORCE_INLINE_TEMPLATE size_t
-ZSTD_compressBlock_fast_noDict_generic(
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_compressBlock_fast_noDict_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize,
         U32 const mls, U32 const hasStep)
@@ -117,7 +173,7 @@ ZSTD_compressBlock_fast_noDict_generic(
 
     U32 rep_offset1 = rep[0];
     U32 rep_offset2 = rep[1];
-    U32 offsetSaved = 0;
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
 
     size_t hash0; /* hash for ip0 */
     size_t hash1; /* hash for ip1 */
@@ -141,8 +197,8 @@ ZSTD_compressBlock_fast_noDict_generic(
     {   U32 const curr = (U32)(ip0 - base);
         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
         U32 const maxRep = curr - windowLow;
-        if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0;
-        if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0;
+        if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0;
+        if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0;
     }
 
     /* start each op */
@@ -180,8 +236,14 @@ ZSTD_compressBlock_fast_noDict_generic(
             mLength = ip0[-1] == match0[-1];
             ip0 -= mLength;
             match0 -= mLength;
-            offcode = STORE_REPCODE_1;
+            offcode = REPCODE1_TO_OFFBASE;
             mLength += 4;
+
+            /* First write next hash table entry; we've already calculated it.
+             * This write is known to be safe because the ip1 is before the
+             * repcode (ip2). */
+            hashTable[hash1] = (U32)(ip1 - base);
+
             goto _match;
         }
 
@@ -195,6 +257,12 @@ ZSTD_compressBlock_fast_noDict_generic(
         /* check match at ip[0] */
         if (MEM_read32(ip0) == mval) {
             /* found a match! */
+
+            /* First write next hash table entry; we've already calculated it.
+             * This write is known to be safe because the ip1 == ip0 + 1, so
+             * we know we will resume searching after ip1 */
+            hashTable[hash1] = (U32)(ip1 - base);
+
             goto _offset;
         }
 
@@ -224,6 +292,21 @@ ZSTD_compressBlock_fast_noDict_generic(
         /* check match at ip[0] */
         if (MEM_read32(ip0) == mval) {
             /* found a match! */
+
+            /* first write next hash table entry; we've already calculated it */
+            if (step <= 4) {
+                /* We need to avoid writing an index into the hash table >= the
+                 * position at which we will pick up our searching after we've
+                 * taken this match.
+                 *
+                 * The minimum possible match has length 4, so the earliest ip0
+                 * can be after we take this match will be the current ip0 + 4.
+                 * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely
+                 * write this position.
+                 */
+                hashTable[hash1] = (U32)(ip1 - base);
+            }
+
             goto _offset;
         }
 
@@ -254,9 +337,24 @@ ZSTD_compressBlock_fast_noDict_generic(
      * However, it seems to be a meaningful performance hit to try to search
      * them. So let's not. */
 
+    /* When the repcodes are outside of the prefix, we set them to zero before the loop.
+     * When the offsets are still zero, we need to restore them after the block to have a correct
+     * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
+     * offsets were invalid. We need to figure out which offset to refill with.
+     *     - If both offsets are zero they are in the same order.
+     *     - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
+     *     - If only one is zero, we need to decide which offset to restore.
+     *         - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
+     *         - It is impossible for rep_offset2 to be non-zero.
+     *
+     * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
+     * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
+     */
+    offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;
+
     /* save reps for next block */
-    rep[0] = rep_offset1 ? rep_offset1 : offsetSaved;
-    rep[1] = rep_offset2 ? rep_offset2 : offsetSaved;
+    rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
+    rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
@@ -267,7 +365,7 @@ ZSTD_compressBlock_fast_noDict_generic(
     match0 = base + idx;
     rep_offset2 = rep_offset1;
     rep_offset1 = (U32)(ip0-match0);
-    offcode = STORE_OFFSET(rep_offset1);
+    offcode = OFFSET_TO_OFFBASE(rep_offset1);
     mLength = 4;
 
     /* Count the backwards match length. */
@@ -287,11 +385,6 @@ ZSTD_compressBlock_fast_noDict_generic(
     ip0 += mLength;
     anchor = ip0;
 
-    /* write next hash table entry */
-    if (ip1 < ip0) {
-        hashTable[hash1] = (U32)(ip1 - base);
-    }
-
     /* Fill table and check for immediate repcode. */
     if (ip0 <= ilimit) {
         /* Fill Table */
@@ -306,7 +399,7 @@ ZSTD_compressBlock_fast_noDict_generic(
                 { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
                 hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
                 ip0 += rLength;
-                ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength);
+                ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
                 anchor = ip0;
                 continue;   /* faster when present (confirmed on gcc-8) ... (?) */
     }   }   }
@@ -369,6 +462,7 @@ size_t ZSTD_compressBlock_fast(
 }
 
 FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
 size_t ZSTD_compressBlock_fast_dictMatchState_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
@@ -380,14 +474,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
     U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
     const BYTE* const base = ms->window.base;
     const BYTE* const istart = (const BYTE*)src;
-    const BYTE* ip = istart;
+    const BYTE* ip0 = istart;
+    const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
     const BYTE* anchor = istart;
     const U32   prefixStartIndex = ms->window.dictLimit;
     const BYTE* const prefixStart = base + prefixStartIndex;
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - HASH_READ_SIZE;
     U32 offset_1=rep[0], offset_2=rep[1];
-    U32 offsetSaved = 0;
 
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
     const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
@@ -397,13 +491,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
     const BYTE* const dictStart    = dictBase + dictStartIndex;
     const BYTE* const dictEnd      = dms->window.nextSrc;
     const U32 dictIndexDelta       = prefixStartIndex - (U32)(dictEnd - dictBase);
-    const U32 dictAndPrefixLength  = (U32)(ip - prefixStart + dictEnd - dictStart);
-    const U32 dictHLog             = dictCParams->hashLog;
+    const U32 dictAndPrefixLength  = (U32)(istart - prefixStart + dictEnd - dictStart);
+    const U32 dictHBits            = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
 
     /* if a dictionary is still attached, it necessarily means that
      * it is within window size. So we just check it. */
     const U32 maxDistance = 1U << cParams->windowLog;
-    const U32 endIndex = (U32)((size_t)(ip - base) + srcSize);
+    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
     assert(endIndex - prefixStartIndex <= maxDistance);
     (void)maxDistance; (void)endIndex;   /* these variables are not used when assert() is disabled */
 
@@ -413,106 +507,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
      * when translating a dict index into a local index */
     assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
 
+    if (ms->prefetchCDictTables) {
+        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
+        PREFETCH_AREA(dictHashTable, hashTableBytes);
+    }
+
     /* init */
     DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
-    ip += (dictAndPrefixLength == 0);
+    ip0 += (dictAndPrefixLength == 0);
     /* dictMatchState repCode checks don't currently handle repCode == 0
      * disabling. */
     assert(offset_1 <= dictAndPrefixLength);
     assert(offset_2 <= dictAndPrefixLength);
 
-    /* Main Search Loop */
-    while (ip < ilimit) {   /* < instead of <=, because repcode check at (ip+1) */
+    /* Outer search loop */
+    assert(stepSize >= 1);
+    while (ip1 <= ilimit) {   /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
         size_t mLength;
-        size_t const h = ZSTD_hashPtr(ip, hlog, mls);
-        U32 const curr = (U32)(ip-base);
-        U32 const matchIndex = hashTable[h];
-        const BYTE* match = base + matchIndex;
-        const U32 repIndex = curr + 1 - offset_1;
-        const BYTE* repMatch = (repIndex < prefixStartIndex) ?
-                               dictBase + (repIndex - dictIndexDelta) :
-                               base + repIndex;
-        hashTable[h] = curr;   /* update hash table */
-
-        if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
-          && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
-            const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
-            mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
-            ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength);
-        } else if ( (matchIndex <= prefixStartIndex) ) {
-            size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls);
-            U32 const dictMatchIndex = dictHashTable[dictHash];
-            const BYTE* dictMatch = dictBase + dictMatchIndex;
-            if (dictMatchIndex <= dictStartIndex ||
-                MEM_read32(dictMatch) != MEM_read32(ip)) {
-                assert(stepSize >= 1);
-                ip += ((ip-anchor) >> kSearchStrength) + stepSize;
-                continue;
-            } else {
-                /* found a dict match */
-                U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta);
-                mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4;
-                while (((ip>anchor) & (dictMatch>dictStart))
-                     && (ip[-1] == dictMatch[-1])) {
-                    ip--; dictMatch--; mLength++;
+        size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+
+        size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
+        U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
+        int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);
+
+        U32 matchIndex = hashTable[hash0];
+        U32 curr = (U32)(ip0 - base);
+        size_t step = stepSize;
+        const size_t kStepIncr = 1 << kSearchStrength;
+        const BYTE* nextStep = ip0 + kStepIncr;
+
+        /* Inner search loop */
+        while (1) {
+            const BYTE* match = base + matchIndex;
+            const U32 repIndex = curr + 1 - offset_1;
+            const BYTE* repMatch = (repIndex < prefixStartIndex) ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+            const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+            size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
+            hashTable[hash0] = curr;   /* update hash table */
+
+            if (((U32) ((prefixStartIndex - 1) - repIndex) >=
+                 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
+                && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
+                const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+                mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
+                ip0++;
+                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
+                break;
+            }
+
+            if (dictTagsMatch) {
+                /* Found a possible dict match */
+                const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
+                const BYTE* dictMatch = dictBase + dictMatchIndex;
+                if (dictMatchIndex > dictStartIndex &&
+                    MEM_read32(dictMatch) == MEM_read32(ip0)) {
+                    /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
+                    if (matchIndex <= prefixStartIndex) {
+                        U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
+                        mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
+                        while (((ip0 > anchor) & (dictMatch > dictStart))
+                            && (ip0[-1] == dictMatch[-1])) {
+                            ip0--;
+                            dictMatch--;
+                            mLength++;
+                        } /* catch up */
+                        offset_2 = offset_1;
+                        offset_1 = offset;
+                        ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+                        break;
+                    }
+                }
+            }
+
+            if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) {
+                /* found a regular match */
+                U32 const offset = (U32) (ip0 - match);
+                mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
+                while (((ip0 > anchor) & (match > prefixStart))
+                       && (ip0[-1] == match[-1])) {
+                    ip0--;
+                    match--;
+                    mLength++;
                 } /* catch up */
                 offset_2 = offset_1;
                 offset_1 = offset;
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
+                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
+                break;
             }
-        } else if (MEM_read32(match) != MEM_read32(ip)) {
-            /* it's not a match, and we're not going to check the dictionary */
-            assert(stepSize >= 1);
-            ip += ((ip-anchor) >> kSearchStrength) + stepSize;
-            continue;
-        } else {
-            /* found a regular match */
-            U32 const offset = (U32)(ip-match);
-            mLength = ZSTD_count(ip+4, match+4, iend) + 4;
-            while (((ip>anchor) & (match>prefixStart))
-                 && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
-            offset_2 = offset_1;
-            offset_1 = offset;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
-        }
+
+            /* Prepare for next iteration */
+            dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
+            dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
+            matchIndex = hashTable[hash1];
+
+            if (ip1 >= nextStep) {
+                step++;
+                nextStep += kStepIncr;
+            }
+            ip0 = ip1;
+            ip1 = ip1 + step;
+            if (ip1 > ilimit) goto _cleanup;
+
+            curr = (U32)(ip0 - base);
+            hash0 = hash1;
+        }   /* end inner search loop */
 
         /* match found */
-        ip += mLength;
-        anchor = ip;
+        assert(mLength);
+        ip0 += mLength;
+        anchor = ip0;
 
-        if (ip <= ilimit) {
+        if (ip0 <= ilimit) {
             /* Fill Table */
             assert(base+curr+2 > istart);  /* check base overflow */
             hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;  /* here because curr+2 could be > iend-8 */
-            hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+            hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
 
             /* check immediate repcode */
-            while (ip <= ilimit) {
-                U32 const current2 = (U32)(ip-base);
+            while (ip0 <= ilimit) {
+                U32 const current2 = (U32)(ip0-base);
                 U32 const repIndex2 = current2 - offset_2;
                 const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
                         dictBase - dictIndexDelta + repIndex2 :
                         base + repIndex2;
                 if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
-                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+                   && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
                     const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
-                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                    size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                     U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2);
-                    hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
-                    ip += repLength2;
-                    anchor = ip;
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                    hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
+                    ip0 += repLength2;
+                    anchor = ip0;
                     continue;
                 }
                 break;
             }
         }
+
+        /* Prepare for next iteration */
+        assert(ip0 == anchor);
+        ip1 = ip0 + stepSize;
     }
 
+_cleanup:
     /* save reps for next block */
-    rep[0] = offset_1 ? offset_1 : offsetSaved;
-    rep[1] = offset_2 ? offset_2 : offsetSaved;
+    rep[0] = offset_1;
+    rep[1] = offset_2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
@@ -545,7 +688,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState(
 }
 
 
-static size_t ZSTD_compressBlock_fast_extDict_generic(
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_compressBlock_fast_extDict_generic(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
 {
@@ -553,11 +698,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
     U32* const hashTable = ms->hashTable;
     U32 const hlog = cParams->hashLog;
     /* support stepSize of 0 */
-    U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
+    size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
     const BYTE* const base = ms->window.base;
     const BYTE* const dictBase = ms->window.dictBase;
     const BYTE* const istart = (const BYTE*)src;
-    const BYTE* ip = istart;
     const BYTE* anchor = istart;
     const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
     const U32   lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
@@ -570,6 +714,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
     const BYTE* const iend = istart + srcSize;
     const BYTE* const ilimit = iend - 8;
     U32 offset_1=rep[0], offset_2=rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
+
+    const BYTE* ip0 = istart;
+    const BYTE* ip1;
+    const BYTE* ip2;
+    const BYTE* ip3;
+    U32 current0;
+
+
+    size_t hash0; /* hash for ip0 */
+    size_t hash1; /* hash for ip1 */
+    U32 idx; /* match idx for ip0 */
+    const BYTE* idxBase; /* base pointer for idx */
+
+    U32 offcode;
+    const BYTE* match0;
+    size_t mLength;
+    const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */
+
+    size_t step;
+    const BYTE* nextStep;
+    const size_t kStepIncr = (1 << (kSearchStrength - 1));
 
     (void)hasStep; /* not currently specialized on whether it's accelerated */
 
@@ -579,75 +745,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic(
     if (prefixStartIndex == dictStartIndex)
         return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
 
-    /* Search Loop */
-    while (ip < ilimit) {  /* < instead of <=, because (ip+1) */
-        const size_t h = ZSTD_hashPtr(ip, hlog, mls);
-        const U32    matchIndex = hashTable[h];
-        const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
-        const BYTE*  match = matchBase + matchIndex;
-        const U32    curr = (U32)(ip-base);
-        const U32    repIndex = curr + 1 - offset_1;
-        const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
-        const BYTE* const repMatch = repBase + repIndex;
-        hashTable[h] = curr;   /* update hash table */
-        DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr);
-
-        if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */
-             & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */
-           && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
-            const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
-            size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4;
-            ip++;
-            ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength);
-            ip += rLength;
-            anchor = ip;
-        } else {
-            if ( (matchIndex < dictStartIndex) ||
-                 (MEM_read32(match) != MEM_read32(ip)) ) {
-                assert(stepSize >= 1);
-                ip += ((ip-anchor) >> kSearchStrength) + stepSize;
-                continue;
+    {   U32 const curr = (U32)(ip0 - base);
+        U32 const maxRep = curr - dictStartIndex;
+        if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0;
+    }
+
+    /* start each op */
+_start: /* Requires: ip0 */
+
+    step = stepSize;
+    nextStep = ip0 + kStepIncr;
+
+    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
+    ip1 = ip0 + 1;
+    ip2 = ip0 + step;
+    ip3 = ip2 + 1;
+
+    if (ip3 >= ilimit) {
+        goto _cleanup;
+    }
+
+    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
+    hash1 = ZSTD_hashPtr(ip1, hlog, mls);
+
+    idx = hashTable[hash0];
+    idxBase = idx < prefixStartIndex ? dictBase : base;
+
+    do {
+        {   /* load repcode match for ip[2] */
+            U32 const current2 = (U32)(ip2 - base);
+            U32 const repIndex = current2 - offset_1;
+            const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+            U32 rval;
+            if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
+                 & (offset_1 > 0) ) {
+                rval = MEM_read32(repBase + repIndex);
+            } else {
+                rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
             }
-            {   const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
-                const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
-                U32 const offset = curr - matchIndex;
-                size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
-                while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; }   /* catch up */
-                offset_2 = offset_1; offset_1 = offset;  /* update offset history */
-                ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength);
-                ip += mLength;
-                anchor = ip;
+
+            /* write back hash table entry */
+            current0 = (U32)(ip0 - base);
+            hashTable[hash0] = current0;
+
+            /* check repcode at ip[2] */
+            if (MEM_read32(ip2) == rval) {
+                ip0 = ip2;
+                match0 = repBase + repIndex;
+                matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+                assert((match0 != prefixStart) & (match0 != dictStart));
+                mLength = ip0[-1] == match0[-1];
+                ip0 -= mLength;
+                match0 -= mLength;
+                offcode = REPCODE1_TO_OFFBASE;
+                mLength += 4;
+                goto _match;
         }   }
 
-        if (ip <= ilimit) {
-            /* Fill Table */
-            hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;
-            hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
-            /* check immediate repcode */
-            while (ip <= ilimit) {
-                U32 const current2 = (U32)(ip-base);
-                U32 const repIndex2 = current2 - offset_2;
-                const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
-                if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex))  /* intentional overflow */
-                   && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
-                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
-                    size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
-                    { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2);
-                    hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
-                    ip += repLength2;
-                    anchor = ip;
-                    continue;
-                }
-                break;
-    }   }   }
+        {   /* load match for ip[0] */
+            U32 const mval = idx >= dictStartIndex ?
+                    MEM_read32(idxBase + idx) :
+                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */
+
+            /* check match at ip[0] */
+            if (MEM_read32(ip0) == mval) {
+                /* found a match! */
+                goto _offset;
+        }   }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+        idxBase = idx < prefixStartIndex ? dictBase : base;
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip3;
+
+        /* write back hash table entry */
+        current0 = (U32)(ip0 - base);
+        hashTable[hash0] = current0;
+
+        {   /* load match for ip[0] */
+            U32 const mval = idx >= dictStartIndex ?
+                    MEM_read32(idxBase + idx) :
+                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */
+
+            /* check match at ip[0] */
+            if (MEM_read32(ip0) == mval) {
+                /* found a match! */
+                goto _offset;
+        }   }
+
+        /* lookup ip[1] */
+        idx = hashTable[hash1];
+        idxBase = idx < prefixStartIndex ? dictBase : base;
+
+        /* hash ip[2] */
+        hash0 = hash1;
+        hash1 = ZSTD_hashPtr(ip2, hlog, mls);
+
+        /* advance to next positions */
+        ip0 = ip1;
+        ip1 = ip2;
+        ip2 = ip0 + step;
+        ip3 = ip1 + step;
+
+        /* calculate step */
+        if (ip2 >= nextStep) {
+            step++;
+            PREFETCH_L1(ip1 + 64);
+            PREFETCH_L1(ip1 + 128);
+            nextStep += kStepIncr;
+        }
+    } while (ip3 < ilimit);
+
+_cleanup:
+    /* Note that there are probably still a couple positions we could search.
+     * However, it seems to be a meaningful performance hit to try to search
+     * them. So let's not. */
+
+    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
 
     /* save reps for next block */
-    rep[0] = offset_1;
-    rep[1] = offset_2;
+    rep[0] = offset_1 ? offset_1 : offsetSaved1;
+    rep[1] = offset_2 ? offset_2 : offsetSaved2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
+
+_offset: /* Requires: ip0, idx, idxBase */
+
+    /* Compute the offset code. */
+    {   U32 const offset = current0 - idx;
+        const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
+        matchEnd = idx < prefixStartIndex ? dictEnd : iend;
+        match0 = idxBase + idx;
+        offset_2 = offset_1;
+        offset_1 = offset;
+        offcode = OFFSET_TO_OFFBASE(offset);
+        mLength = 4;
+
+        /* Count the backwards match length. */
+        while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) {
+            ip0--;
+            match0--;
+            mLength++;
+    }   }
+
+_match: /* Requires: ip0, match0, offcode, matchEnd */
+
+    /* Count the forward length. */
+    assert(matchEnd != 0);
+    mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);
+
+    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
+
+    ip0 += mLength;
+    anchor = ip0;
+
+    /* write next hash table entry */
+    if (ip1 < ip0) {
+        hashTable[hash1] = (U32)(ip1 - base);
+    }
+
+    /* Fill table and check for immediate repcode. */
+    if (ip0 <= ilimit) {
+        /* Fill Table */
+        assert(base+current0+2 > istart);  /* check base overflow */
+        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
+        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+        while (ip0 <= ilimit) {
+            U32 const repIndex2 = (U32)(ip0-base) - offset_2;
+            const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+            if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0))  /* intentional underflow */
+                 && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
+                const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+                size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+                { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */
+                ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
+                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
+                ip0 += repLength2;
+                anchor = ip0;
+                continue;
+            }
+            break;
+    }   }
+
+    goto _start;
 }
 
 ZSTD_GEN_FAST_FN(extDict, 4, 0)
@@ -660,6 +953,7 @@ size_t ZSTD_compressBlock_fast_extDict(
         void const* src, size_t srcSize)
 {
     U32 const mls = ms->cParams.minMatch;
+    assert(ms->dictMatchState == NULL);
     switch(mls)
     {
     default: /* includes case 3 */
diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h
index fddc2f532d21..e64d9e1b2d39 100644
--- a/lib/zstd/compress/zstd_fast.h
+++ b/lib/zstd/compress/zstd_fast.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -16,7 +17,8 @@
 #include "zstd_compress_internal.h"
 
 void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
-                        void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+                        void const* end, ZSTD_dictTableLoadMethod_e dtlm,
+                        ZSTD_tableFillPurpose_e tfp);
 size_t ZSTD_compressBlock_fast(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c
index 0298a01a7504..3e88d8a1a136 100644
--- a/lib/zstd/compress/zstd_lazy.c
+++ b/lib/zstd/compress/zstd_lazy.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -10,14 +11,23 @@
 
 #include "zstd_compress_internal.h"
 #include "zstd_lazy.h"
+#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */
+
+#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
+
+#define kLazySkippingStep 8
 
 
 /*-*************************************
 *  Binary Tree search
 ***************************************/
 
-static void
-ZSTD_updateDUBT(ZSTD_matchState_t* ms,
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_updateDUBT(ZSTD_matchState_t* ms,
                 const BYTE* ip, const BYTE* iend,
                 U32 mls)
 {
@@ -60,8 +70,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms,
  *  sort one already inserted but unsorted position
  *  assumption : curr >= btlow == (curr - btmask)
  *  doesn't fail */
-static void
-ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
                  U32 curr, const BYTE* inputEnd,
                  U32 nbCompares, U32 btLow,
                  const ZSTD_dictMode_e dictMode)
@@ -149,8 +160,9 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms,
 }
 
 
-static size_t
-ZSTD_DUBT_findBetterDictMatch (
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_DUBT_findBetterDictMatch (
         const ZSTD_matchState_t* ms,
         const BYTE* const ip, const BYTE* const iend,
         size_t* offsetPtr,
@@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch (
             U32 matchIndex = dictMatchIndex + dictIndexDelta;
             if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
                 DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
-                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex);
-                bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
+                    curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex);
+                bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
             }
             if (ip+matchLength == iend) {   /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
                 break;   /* drop, to guarantee consistency (miss a little bit of compression) */
@@ -218,7 +230,7 @@ ZSTD_DUBT_findBetterDictMatch (
     }
 
     if (bestLength >= MINMATCH) {
-        U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
+        U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex;
         DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
                     curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
     }
@@ -227,10 +239,11 @@ ZSTD_DUBT_findBetterDictMatch (
 }
 
 
-static size_t
-ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
                         const BYTE* const ip, const BYTE* const iend,
-                        size_t* offsetPtr,
+                        size_t* offBasePtr,
                         U32 const mls,
                         const ZSTD_dictMode_e dictMode)
 {
@@ -327,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
             if (matchLength > bestLength) {
                 if (matchLength > matchEndIdx - matchIndex)
                     matchEndIdx = matchIndex + (U32)matchLength;
-                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
-                    bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex);
+                if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) )
+                    bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                 if (ip+matchLength == iend) {   /* equal : no way to know if inf or sup */
                     if (dictMode == ZSTD_dictMatchState) {
                         nbCompares = 0; /* in addition to avoiding checking any
@@ -361,16 +374,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
         if (dictMode == ZSTD_dictMatchState && nbCompares) {
             bestLength = ZSTD_DUBT_findBetterDictMatch(
                     ms, ip, iend,
-                    offsetPtr, bestLength, nbCompares,
+                    offBasePtr, bestLength, nbCompares,
                     mls, dictMode);
         }
 
         assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */
         ms->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
         if (bestLength >= MINMATCH) {
-            U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex;
+            U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex;
             DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
-                        curr, (U32)bestLength, (U32)*offsetPtr, mIndex);
+                        curr, (U32)bestLength, (U32)*offBasePtr, mIndex);
         }
         return bestLength;
     }
@@ -378,17 +391,18 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
 
 
 /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */
-FORCE_INLINE_TEMPLATE size_t
-ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
                 const BYTE* const ip, const BYTE* const iLimit,
-                      size_t* offsetPtr,
+                      size_t* offBasePtr,
                 const U32 mls /* template */,
                 const ZSTD_dictMode_e dictMode)
 {
     DEBUGLOG(7, "ZSTD_BtFindBestMatch");
     if (ip < ms->window.base + ms->nextToUpdate) return 0;   /* skipped area */
     ZSTD_updateDUBT(ms, ip, iLimit, mls);
-    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
+    return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode);
 }
 
 /* *********************************
@@ -561,7 +575,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
         /* save best solution */
         if (currentMl > ml) {
             ml = currentMl;
-            *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
+            *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
             if (ip+currentMl == iLimit) {
                 /* best possible, avoids read overflow on next attempt */
                 return ml;
@@ -598,7 +612,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
             /* save best solution */
             if (currentMl > ml) {
                 ml = currentMl;
-                *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta));
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta));
                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
             }
         }
@@ -614,10 +628,12 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb
 
 /* Update chains up to ip (excluded)
    Assumption : always within prefix (i.e. not within extDict) */
-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_insertAndFindFirstIndex_internal(
                         ZSTD_matchState_t* ms,
                         const ZSTD_compressionParameters* const cParams,
-                        const BYTE* ip, U32 const mls)
+                        const BYTE* ip, U32 const mls, U32 const lazySkipping)
 {
     U32* const hashTable  = ms->hashTable;
     const U32 hashLog = cParams->hashLog;
@@ -632,6 +648,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
         NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
         hashTable[h] = idx;
         idx++;
+        /* Stop inserting every position when in the lazy skipping mode. */
+        if (lazySkipping)
+            break;
     }
 
     ms->nextToUpdate = target;
@@ -640,11 +659,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal(
 
 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
-    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
+    return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0);
 }
 
 /* inlining is important to hardwire a hot branch (template emulation) */
 FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
 size_t ZSTD_HcFindBestMatch(
                         ZSTD_matchState_t* ms,
                         const BYTE* const ip, const BYTE* const iLimit,
@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch(
     }
 
     /* HC4 match finder */
-    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+    matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping);
 
     for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) {
         size_t currentMl=0;
         if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
             const BYTE* const match = base + matchIndex;
             assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
-            if (match[ml] == ip[ml])   /* potentially better */
+            /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+            if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                 currentMl = ZSTD_count(ip, match, iLimit);
         } else {
             const BYTE* const match = dictBase + matchIndex;
@@ -703,7 +724,7 @@ size_t ZSTD_HcFindBestMatch(
         /* save best solution */
         if (currentMl > ml) {
             ml = currentMl;
-            *offsetPtr = STORE_OFFSET(curr - matchIndex);
+            *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
             if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
         }
 
@@ -739,7 +760,7 @@ size_t ZSTD_HcFindBestMatch(
             if (currentMl > ml) {
                 ml = currentMl;
                 assert(curr > matchIndex + dmsIndexDelta);
-                *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
             }
 
@@ -756,8 +777,6 @@ size_t ZSTD_HcFindBestMatch(
 * (SIMD) Row-based matchfinder
 ***********************************/
 /* Constants for row-based hash */
-#define ZSTD_ROW_HASH_TAG_OFFSET 16     /* byte offset of hashes in the match state's tagTable from the beginning of a row */
-#define ZSTD_ROW_HASH_TAG_BITS 8        /* nb bits to use for the tag */
 #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1)
 #define ZSTD_ROW_HASH_MAX_ENTRIES 64    /* absolute maximum number of entries per row, for all configurations */
 
@@ -769,64 +788,19 @@ typedef U64 ZSTD_VecMask;   /* Clarifies when we are interacting with a U64 repr
  * Starting from the LSB, returns the idx of the next non-zero bit.
  * Basically counting the nb of trailing zeroes.
  */
-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
-    assert(val != 0);
-#   if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))
-    if (sizeof(size_t) == 4) {
-        U32 mostSignificantWord = (U32)(val >> 32);
-        U32 leastSignificantWord = (U32)val;
-        if (leastSignificantWord == 0) {
-            return 32 + (U32)__builtin_ctz(mostSignificantWord);
-        } else {
-            return (U32)__builtin_ctz(leastSignificantWord);
-        }
-    } else {
-        return (U32)__builtin_ctzll(val);
-    }
-#   else
-    /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count
-     * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer
-     */
-    val = ~val & (val - 1ULL); /* Lowest set bit mask */
-    val = val - ((val >> 1) & 0x5555555555555555);
-    val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL);
-    return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56);
-#   endif
-}
-
-/* ZSTD_rotateRight_*():
- * Rotates a bitfield to the right by "count" bits.
- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts
- */
-FORCE_INLINE_TEMPLATE
-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) {
-    assert(count < 64);
-    count &= 0x3F; /* for fickle pattern recognition */
-    return (value >> count) | (U64)(value << ((0U - count) & 0x3F));
-}
-
-FORCE_INLINE_TEMPLATE
-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) {
-    assert(count < 32);
-    count &= 0x1F; /* for fickle pattern recognition */
-    return (value >> count) | (U32)(value << ((0U - count) & 0x1F));
-}
-
-FORCE_INLINE_TEMPLATE
-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) {
-    assert(count < 16);
-    count &= 0x0F; /* for fickle pattern recognition */
-    return (value >> count) | (U16)(value << ((0U - count) & 0x0F));
+MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) {
+    return ZSTD_countTrailingZeros64(val);
 }
 
 /* ZSTD_row_nextIndex():
  * Returns the next index to insert at within a tagTable row, and updates the "head"
- * value to reflect the update. Essentially cycles backwards from [0, {entries per row})
+ * value to reflect the update. Essentially cycles backwards from [1, {entries per row})
  */
 FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) {
-  U32 const next = (*tagRow - 1) & rowMask;
-  *tagRow = (BYTE)next;
-  return next;
+    U32 next = (*tagRow-1) & rowMask;
+    next += (next == 0) ? rowMask : 0; /* skip first position */
+    *tagRow = (BYTE)next;
+    return next;
 }
 
 /* ZSTD_isAligned():
@@ -840,7 +814,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) {
 /* ZSTD_row_prefetch():
  * Performs prefetching for the hashTable and tagTable at a given row.
  */
-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) {
+FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) {
     PREFETCH_L1(hashTable + relRow);
     if (rowLog >= 5) {
         PREFETCH_L1(hashTable + relRow + 16);
@@ -859,18 +833,20 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* ta
  * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries,
  * but not beyond iLimit.
  */
-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base,
                                    U32 const rowLog, U32 const mls,
                                    U32 idx, const BYTE* const iLimit)
 {
     U32 const* const hashTable = ms->hashTable;
-    U16 const* const tagTable = ms->tagTable;
+    BYTE const* const tagTable = ms->tagTable;
     U32 const hashLog = ms->rowHashLog;
     U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1);
     U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch);
 
     for (; idx < lim; ++idx) {
-        U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
         U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
         ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash;
@@ -885,12 +861,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B
  * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at
  * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
  */
-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
-                                                  U16 const* tagTable, BYTE const* base,
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable,
+                                                  BYTE const* tagTable, BYTE const* base,
                                                   U32 idx, U32 const hashLog,
-                                                  U32 const rowLog, U32 const mls)
+                                                  U32 const rowLog, U32 const mls,
+                                                  U64 const hashSalt)
 {
-    U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+    U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
     U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
     ZSTD_row_prefetch(hashTable, tagTable, row, rowLog);
     {   U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK];
@@ -902,28 +881,29 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab
 /* ZSTD_row_update_internalImpl():
  * Updates the hash table with positions starting from updateStartIdx until updateEndIdx.
  */
-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
-                                                        U32 updateStartIdx, U32 const updateEndIdx,
-                                                        U32 const mls, U32 const rowLog,
-                                                        U32 const rowMask, U32 const useCache)
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
+                                  U32 updateStartIdx, U32 const updateEndIdx,
+                                  U32 const mls, U32 const rowLog,
+                                  U32 const rowMask, U32 const useCache)
 {
     U32* const hashTable = ms->hashTable;
-    U16* const tagTable = ms->tagTable;
+    BYTE* const tagTable = ms->tagTable;
     U32 const hashLog = ms->rowHashLog;
     const BYTE* const base = ms->window.base;
 
     DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
     for (; updateStartIdx < updateEndIdx; ++updateStartIdx) {
-        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls)
-                                  : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
+        U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt)
+                                  : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt);
         U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         U32* const row = hashTable + relRow;
-        BYTE* tagRow = (BYTE*)(tagTable + relRow);  /* Though tagTable is laid out as a table of U16, each tag is only 1 byte.
-                                                       Explicit cast allows us to get exact desired position within each row */
+        BYTE* tagRow = tagTable + relRow;
         U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
 
-        assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls));
-        ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK;
+        assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt));
+        tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK;
         row[pos] = updateStartIdx;
     }
 }
@@ -932,9 +912,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms,
  * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate.
  * Skips sections of long matches as is necessary.
  */
-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
-                                                    U32 const mls, U32 const rowLog,
-                                                    U32 const rowMask, U32 const useCache)
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip,
+                              U32 const mls, U32 const rowLog,
+                              U32 const rowMask, U32 const useCache)
 {
     U32 idx = ms->nextToUpdate;
     const BYTE* const base = ms->window.base;
@@ -971,7 +953,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) {
     const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */);
 
     DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
-    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */);
+    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */);
+}
+
+/* Returns the mask width of bits group of which will be set to 1. Given not all
+ * architectures have easy movemask instruction, this helps to iterate over
+ * groups of bits easier and faster.
+ */
+FORCE_INLINE_TEMPLATE U32
+ZSTD_row_matchMaskGroupWidth(const U32 rowEntries)
+{
+    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+    assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+    (void)rowEntries;
+#if defined(ZSTD_ARCH_ARM_NEON)
+    /* NEON path only works for little endian */
+    if (!MEM_isLittleEndian()) {
+        return 1;
+    }
+    if (rowEntries == 16) {
+        return 4;
+    }
+    if (rowEntries == 32) {
+        return 2;
+    }
+    if (rowEntries == 64) {
+        return 1;
+    }
+#endif
+    return 1;
 }
 
 #if defined(ZSTD_ARCH_X86_SSE2)
@@ -994,71 +1004,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U
 }
 #endif
 
-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches
- * the hash at the nth position in a row of the tagTable.
- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield
- * to match up with the actual layout of the entries within the hashTable */
+#if defined(ZSTD_ARCH_ARM_NEON)
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped)
+{
+    assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
+    if (rowEntries == 16) {
+        /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits.
+         * After that groups of 4 bits represent the equalMask. We lower
+         * all bits except the highest in these groups by doing AND with
+         * 0x88 = 0b10001000.
+         */
+        const uint8x16_t chunk = vld1q_u8(src);
+        const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
+        const uint8x8_t res = vshrn_n_u16(equalMask, 4);
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
+        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull;
+    } else if (rowEntries == 32) {
+        /* Same idea as with rowEntries == 16 but doing AND with
+         * 0x55 = 0b01010101.
+         */
+        const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src);
+        const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
+        const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
+        const uint8x16_t dup = vdupq_n_u8(tag);
+        const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6);
+        const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6);
+        const uint8x8_t res = vsli_n_u8(t0, t1, 4);
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ;
+        return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull;
+    } else { /* rowEntries == 64 */
+        const uint8x16x4_t chunk = vld4q_u8(src);
+        const uint8x16_t dup = vdupq_n_u8(tag);
+        const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
+        const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
+        const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
+        const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
+
+        const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
+        const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
+        const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
+        const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
+        const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
+        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
+        return ZSTD_rotateRight_U64(matches, headGrouped);
+    }
+}
+#endif
+
+/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
+ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
+ * matches the hash at the nth position in a row of the tagTable.
+ * Each row is a circular buffer beginning at the value of "headGrouped". So we
+ * must rotate the "matches" bitfield to match up with the actual layout of the
+ * entries within the hashTable */
 FORCE_INLINE_TEMPLATE ZSTD_VecMask
-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries)
+ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries)
 {
-    const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET;
+    const BYTE* const src = tagRow;
     assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64);
     assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES);
+    assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8);
 
 #if defined(ZSTD_ARCH_X86_SSE2)
 
-    return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head);
+    return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
 
 #else /* SW or NEON-LE */
 
 # if defined(ZSTD_ARCH_ARM_NEON)
   /* This NEON path only works for little endian - otherwise use SWAR below */
     if (MEM_isLittleEndian()) {
-        if (rowEntries == 16) {
-            const uint8x16_t chunk = vld1q_u8(src);
-            const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag)));
-            const uint16x8_t t0 = vshlq_n_u16(equalMask, 7);
-            const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14));
-            const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14));
-            const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28));
-            const U16 hi = (U16)vgetq_lane_u8(t3, 8);
-            const U16 lo = (U16)vgetq_lane_u8(t3, 0);
-            return ZSTD_rotateRight_U16((hi << 8) | lo, head);
-        } else if (rowEntries == 32) {
-            const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src);
-            const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]);
-            const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]);
-            const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag));
-            const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag));
-            const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0));
-            const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1));
-            const uint8x8_t t0 = vreinterpret_u8_s8(pack0);
-            const uint8x8_t t1 = vreinterpret_u8_s8(pack1);
-            const uint8x8_t t2 = vsri_n_u8(t1, t0, 2);
-            const uint8x8x2_t t3 = vuzp_u8(t2, t0);
-            const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4);
-            const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0);
-            return ZSTD_rotateRight_U32(matches, head);
-        } else { /* rowEntries == 64 */
-            const uint8x16x4_t chunk = vld4q_u8(src);
-            const uint8x16_t dup = vdupq_n_u8(tag);
-            const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup);
-            const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup);
-            const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup);
-            const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup);
-
-            const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1);
-            const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1);
-            const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2);
-            const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4);
-            const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4);
-            const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0);
-            return ZSTD_rotateRight_U64(matches, head);
-        }
+        return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
     }
 # endif /* ZSTD_ARCH_ARM_NEON */
     /* SWAR */
-    {   const size_t chunkSize = sizeof(size_t);
+    {   const int chunkSize = sizeof(size_t);
         const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
         const size_t xFF = ~((size_t)0);
         const size_t x01 = xFF / 0xFF;
@@ -1091,11 +1112,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
         }
         matches = ~matches;
         if (rowEntries == 16) {
-            return ZSTD_rotateRight_U16((U16)matches, head);
+            return ZSTD_rotateRight_U16((U16)matches, headGrouped);
         } else if (rowEntries == 32) {
-            return ZSTD_rotateRight_U32((U32)matches, head);
+            return ZSTD_rotateRight_U32((U32)matches, headGrouped);
         } else {
-            return ZSTD_rotateRight_U64((U64)matches, head);
+            return ZSTD_rotateRight_U64((U64)matches, headGrouped);
         }
     }
 #endif
@@ -1103,20 +1124,21 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head,
 
 /* The high-level approach of the SIMD row based match finder is as follows:
  * - Figure out where to insert the new entry:
- *      - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag"
- *      - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines
+ *      - Generate a hash for current input posistion and split it into a one byte of tag and `rowHashLog` bits of index.
+ *           - The hash is salted by a value that changes on every contex reset, so when the same table is used
+ *             we will avoid collisions that would otherwise slow us down by intorducing phantom matches.
+ *      - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines
  *        which row to insert into.
- *      - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can
- *        be considered as a circular buffer with a "head" index that resides in the tagTable.
- *      - Also insert the "tag" into the equivalent row and position in the tagTable.
- *          - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry.
- *                  The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively,
- *                  for alignment/performance reasons, leaving some bytes unused.
- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and
+ *      - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can
+ *        be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes
+ *        per row).
+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and
  *   generate a bitfield that we can cycle through to check the collisions in the hash table.
  * - Pick the longest match.
+ * - Insert the tag into the equivalent row and position in the tagTable.
  */
 FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
 size_t ZSTD_RowFindBestMatch(
                         ZSTD_matchState_t* ms,
                         const BYTE* const ip, const BYTE* const iLimit,
@@ -1125,7 +1147,7 @@ size_t ZSTD_RowFindBestMatch(
                         const U32 rowLog)
 {
     U32* const hashTable = ms->hashTable;
-    U16* const tagTable = ms->tagTable;
+    BYTE* const tagTable = ms->tagTable;
     U32* const hashCache = ms->hashCache;
     const U32 hashLog = ms->rowHashLog;
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
@@ -1143,8 +1165,11 @@ size_t ZSTD_RowFindBestMatch(
     const U32 rowEntries = (1U << rowLog);
     const U32 rowMask = rowEntries - 1;
     const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */
+    const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries);
+    const U64 hashSalt = ms->hashSalt;
     U32 nbAttempts = 1U << cappedSearchLog;
     size_t ml=4-1;
+    U32 hash;
 
     /* DMS/DDS variables that may be referenced laster */
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
@@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch(
     if (dictMode == ZSTD_dictMatchState) {
         /* Prefetch DMS rows */
         U32* const dmsHashTable = dms->hashTable;
-        U16* const dmsTagTable = dms->tagTable;
+        BYTE* const dmsTagTable = dms->tagTable;
         U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls);
         U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK;
@@ -1178,23 +1203,34 @@ size_t ZSTD_RowFindBestMatch(
     }
 
     /* Update the hashTable and tagTable up to (but not including) ip */
-    ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+    if (!ms->lazySkipping) {
+        ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */);
+        hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt);
+    } else {
+        /* Stop inserting every position when in the lazy skipping mode.
+         * The hash cache is also not kept up to date in this mode.
+         */
+        hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt);
+        ms->nextToUpdate = curr;
+    }
+    ms->hashSaltEntropy += hash; /* collect salt entropy */
+
     {   /* Get the hash for ip, compute the appropriate row */
-        U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls);
         U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog;
         U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK;
         U32* const row = hashTable + relRow;
         BYTE* tagRow = (BYTE*)(tagTable + relRow);
-        U32 const head = *tagRow & rowMask;
+        U32 const headGrouped = (*tagRow & rowMask) * groupWidth;
         U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
         size_t numMatches = 0;
         size_t currMatch = 0;
-        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries);
+        ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries);
 
         /* Cycle through the matches and prefetch */
-        for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
-            U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+        for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+            U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
             U32 const matchIndex = row[matchPos];
+            if(matchPos == 0) continue;
             assert(numMatches < rowEntries);
             if (matchIndex < lowLimit)
                 break;
@@ -1204,13 +1240,14 @@ size_t ZSTD_RowFindBestMatch(
                 PREFETCH_L1(dictBase + matchIndex);
             }
             matchBuffer[numMatches++] = matchIndex;
+            --nbAttempts;
         }
 
         /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop
            in ZSTD_row_update_internal() at the next search. */
         {
             U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask);
-            tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag;
+            tagRow[pos] = (BYTE)tag;
             row[pos] = ms->nextToUpdate++;
         }
 
@@ -1224,7 +1261,8 @@ size_t ZSTD_RowFindBestMatch(
             if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
                 const BYTE* const match = base + matchIndex;
                 assert(matchIndex >= dictLimit);   /* ensures this is true if dictMode != ZSTD_extDict */
-                if (match[ml] == ip[ml])   /* potentially better */
+                /* read 4B starting from (match + ml + 1 - sizeof(U32)) */
+                if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3))   /* potentially better */
                     currentMl = ZSTD_count(ip, match, iLimit);
             } else {
                 const BYTE* const match = dictBase + matchIndex;
@@ -1236,7 +1274,7 @@ size_t ZSTD_RowFindBestMatch(
             /* Save best solution */
             if (currentMl > ml) {
                 ml = currentMl;
-                *offsetPtr = STORE_OFFSET(curr - matchIndex);
+                *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex);
                 if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
             }
         }
@@ -1254,19 +1292,21 @@ size_t ZSTD_RowFindBestMatch(
         const U32 dmsSize              = (U32)(dmsEnd - dmsBase);
         const U32 dmsIndexDelta        = dictLimit - dmsSize;
 
-        {   U32 const head = *dmsTagRow & rowMask;
+        {   U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth;
             U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES];
             size_t numMatches = 0;
             size_t currMatch = 0;
-            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries);
+            ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries);
 
-            for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) {
-                U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask;
+            for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) {
+                U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask;
                 U32 const matchIndex = dmsRow[matchPos];
+                if(matchPos == 0) continue;
                 if (matchIndex < dmsLowestIndex)
                     break;
                 PREFETCH_L1(dmsBase + matchIndex);
                 matchBuffer[numMatches++] = matchIndex;
+                --nbAttempts;
             }
 
             /* Return the longest match */
@@ -1285,7 +1325,7 @@ size_t ZSTD_RowFindBestMatch(
                 if (currentMl > ml) {
                     ml = currentMl;
                     assert(curr > matchIndex + dmsIndexDelta);
-                    *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta));
+                    *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta));
                     if (ip+currentMl == iLimit) break;
                 }
             }
@@ -1472,8 +1512,9 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax(
 *  Common parser - lazy strategy
 *********************************/
 
-FORCE_INLINE_TEMPLATE size_t
-ZSTD_compressBlock_lazy_generic(
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_compressBlock_lazy_generic(
                         ZSTD_matchState_t* ms, seqStore_t* seqStore,
                         U32 rep[ZSTD_REP_NUM],
                         const void* src, size_t srcSize,
@@ -1491,7 +1532,8 @@ ZSTD_compressBlock_lazy_generic(
     const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6);
     const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6);
 
-    U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
+    U32 offset_1 = rep[0], offset_2 = rep[1];
+    U32 offsetSaved1 = 0, offsetSaved2 = 0;
 
     const int isDMS = dictMode == ZSTD_dictMatchState;
     const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
@@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic(
         U32 const curr = (U32)(ip - base);
         U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog);
         U32 const maxRep = curr - windowLow;
-        if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
-        if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
+        if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0;
     }
     if (isDxS) {
         /* dictMatchState repCode checks don't currently handle repCode == 0
@@ -1522,10 +1564,11 @@ ZSTD_compressBlock_lazy_generic(
         assert(offset_2 <= dictAndPrefixLength);
     }
 
+    /* Reset the lazy skipping state */
+    ms->lazySkipping = 0;
+
     if (searchMethod == search_rowHash) {
-        ZSTD_row_fillHashCache(ms, base, rowLog,
-                            MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
-                            ms->nextToUpdate, ilimit);
+        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
     }
 
     /* Match Loop */
@@ -1537,7 +1580,7 @@ ZSTD_compressBlock_lazy_generic(
 #endif
     while (ip < ilimit) {
         size_t matchLength=0;
-        size_t offcode=STORE_REPCODE_1;
+        size_t offBase = REPCODE1_TO_OFFBASE;
         const BYTE* start=ip+1;
         DEBUGLOG(7, "search baseline (depth 0)");
 
@@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic(
         }
 
         /* first search (depth 0) */
-        {   size_t offsetFound = 999999999;
-            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode);
+        {   size_t offbaseFound = 999999999;
+            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode);
             if (ml2 > matchLength)
-                matchLength = ml2, start = ip, offcode=offsetFound;
+                matchLength = ml2, start = ip, offBase = offbaseFound;
         }
 
         if (matchLength < 4) {
-            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */;
+            ip += step;
+            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+             * In this mode we stop inserting every position into our tables, and only insert
+             * positions that we search, which is one in step positions.
+             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+             * triggered once we've gone 2KB without finding any matches.
+             */
+            ms->lazySkipping = step > kLazySkippingStep;
             continue;
         }
 
@@ -1579,12 +1631,12 @@ ZSTD_compressBlock_lazy_generic(
             DEBUGLOG(7, "search depth 1");
             ip ++;
             if ( (dictMode == ZSTD_noDict)
-              && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+              && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                 size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                 int const gain2 = (int)(mlRep * 3);
-                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                 if ((mlRep >= 4) && (gain2 > gain1))
-                    matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                    matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
             }
             if (isDxS) {
                 const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1596,17 +1648,17 @@ ZSTD_compressBlock_lazy_generic(
                     const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                     size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                     int const gain2 = (int)(mlRep * 3);
-                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                     if ((mlRep >= 4) && (gain2 > gain1))
-                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                 }
             }
-            {   size_t offset2=999999999;
-                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode);
-                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
+            {   size_t ofbCandidate=999999999;
+                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
                 if ((ml2 >= 4) && (gain2 > gain1)) {
-                    matchLength = ml2, offcode = offset2, start = ip;
+                    matchLength = ml2, offBase = ofbCandidate, start = ip;
                     continue;   /* search a better one */
             }   }
 
@@ -1615,12 +1667,12 @@ ZSTD_compressBlock_lazy_generic(
                 DEBUGLOG(7, "search depth 2");
                 ip ++;
                 if ( (dictMode == ZSTD_noDict)
-                  && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                  && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
                     size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
                     int const gain2 = (int)(mlRep * 4);
-                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                     if ((mlRep >= 4) && (gain2 > gain1))
-                        matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                        matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                 }
                 if (isDxS) {
                     const U32 repIndex = (U32)(ip - base) - offset_1;
@@ -1632,17 +1684,17 @@ ZSTD_compressBlock_lazy_generic(
                         const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
                         size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
                         int const gain2 = (int)(mlRep * 4);
-                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                         if ((mlRep >= 4) && (gain2 > gain1))
-                            matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip;
+                            matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip;
                     }
                 }
-                {   size_t offset2=999999999;
-                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode);
-                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
+                {   size_t ofbCandidate=999999999;
+                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
                     if ((ml2 >= 4) && (gain2 > gain1)) {
-                        matchLength = ml2, offcode = offset2, start = ip;
+                        matchLength = ml2, offBase = ofbCandidate, start = ip;
                         continue;
             }   }   }
             break;  /* nothing found : store previous solution */
@@ -1653,26 +1705,33 @@ ZSTD_compressBlock_lazy_generic(
          * notably if `value` is unsigned, resulting in a large positive `-value`.
          */
         /* catch up */
-        if (STORED_IS_OFFSET(offcode)) {
+        if (OFFBASE_IS_OFFSET(offBase)) {
             if (dictMode == ZSTD_noDict) {
-                while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest))
-                     && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) )  /* only search for offset within prefix */
+                while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest))
+                     && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) )  /* only search for offset within prefix */
                     { start--; matchLength++; }
             }
             if (isDxS) {
-                U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
+                U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
                 const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
                 const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
                 while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
             }
-            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
+            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
         }
         /* store sequence */
 _storeSequence:
         {   size_t const litLength = (size_t)(start - anchor);
-            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
+            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
             anchor = ip = start + matchLength;
         }
+        if (ms->lazySkipping) {
+            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+            if (searchMethod == search_rowHash) {
+                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+            }
+            ms->lazySkipping = 0;
+        }
 
         /* check immediate repcode */
         if (isDxS) {
@@ -1686,8 +1745,8 @@ ZSTD_compressBlock_lazy_generic(
                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
                     const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
                     matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
-                    offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset_2 <=> offset_1 */
-                    ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+                    offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset_2 <=> offset_1 */
+                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                     ip += matchLength;
                     anchor = ip;
                     continue;
@@ -1701,166 +1760,181 @@ ZSTD_compressBlock_lazy_generic(
                  && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
                 /* store sequence */
                 matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
-                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */
-                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                 ip += matchLength;
                 anchor = ip;
                 continue;   /* faster when present ... (?) */
     }   }   }
 
-    /* Save reps for next block */
-    rep[0] = offset_1 ? offset_1 : savedOffset;
-    rep[1] = offset_2 ? offset_2 : savedOffset;
+    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
+     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
+    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;
+
+    /* save reps for next block */
+    rep[0] = offset_1 ? offset_1 : offsetSaved1;
+    rep[1] = offset_2 ? offset_2 : offsetSaved2;
 
     /* Return the last literals size */
     return (size_t)(iend - anchor);
 }
+#endif /* build exclusions */
 
 
-size_t ZSTD_compressBlock_btlazy2(
+#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_greedy(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
 }
 
-size_t ZSTD_compressBlock_lazy2(
+size_t ZSTD_compressBlock_greedy_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
 }
 
-size_t ZSTD_compressBlock_lazy(
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
 }
 
-size_t ZSTD_compressBlock_greedy(
+size_t ZSTD_compressBlock_greedy_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
 }
 
-size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+size_t ZSTD_compressBlock_greedy_dictMatchState_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
 }
 
-size_t ZSTD_compressBlock_lazy2_dictMatchState(
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
 }
+#endif
 
-size_t ZSTD_compressBlock_lazy_dictMatchState(
+#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_lazy(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
 }
 
-size_t ZSTD_compressBlock_greedy_dictMatchState(
+size_t ZSTD_compressBlock_lazy_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
 }
 
-
-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
 }
 
-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
+size_t ZSTD_compressBlock_lazy_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
 }
 
-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
+size_t ZSTD_compressBlock_lazy_dictMatchState_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
 }
 
-/* Row-based matchfinder */
-size_t ZSTD_compressBlock_lazy2_row(
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
 }
+#endif
 
-size_t ZSTD_compressBlock_lazy_row(
+#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_lazy2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
 }
 
-size_t ZSTD_compressBlock_greedy_row(
+size_t ZSTD_compressBlock_lazy2_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
 }
 
-size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch);
 }
 
-size_t ZSTD_compressBlock_lazy_dictMatchState_row(
+size_t ZSTD_compressBlock_lazy2_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict);
 }
 
-size_t ZSTD_compressBlock_greedy_dictMatchState_row(
+size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState);
 }
 
-
 size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
     return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch);
 }
+#endif
 
-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
+#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btlazy2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
 }
 
-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch);
+    return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
 }
+#endif
 
+#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
 FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
 size_t ZSTD_compressBlock_lazy_extDict_generic(
                         ZSTD_matchState_t* ms, seqStore_t* seqStore,
                         U32 rep[ZSTD_REP_NUM],
@@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 
     DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
 
+    /* Reset the lazy skipping state */
+    ms->lazySkipping = 0;
+
     /* init */
     ip += (ip == prefixStart);
     if (searchMethod == search_rowHash) {
-        ZSTD_row_fillHashCache(ms, base, rowLog,
-                               MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */),
-                               ms->nextToUpdate, ilimit);
+        ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
     }
 
     /* Match Loop */
@@ -1903,7 +1978,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
 #endif
     while (ip < ilimit) {
         size_t matchLength=0;
-        size_t offcode=STORE_REPCODE_1;
+        size_t offBase = REPCODE1_TO_OFFBASE;
         const BYTE* start=ip+1;
         U32 curr = (U32)(ip-base);
 
@@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
         }   }
 
         /* first search (depth 0) */
-        {   size_t offsetFound = 999999999;
-            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict);
+        {   size_t ofbCandidate = 999999999;
+            size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
             if (ml2 > matchLength)
-                matchLength = ml2, start = ip, offcode=offsetFound;
+                matchLength = ml2, start = ip, offBase = ofbCandidate;
         }
 
         if (matchLength < 4) {
-            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength);
+            ip += step + 1;   /* jump faster over incompressible sections */
+            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
+             * In this mode we stop inserting every position into our tables, and only insert
+             * positions that we search, which is one in step positions.
+             * The exact cutoff is flexible, I've just chosen a number that is reasonably high,
+             * so we minimize the compression ratio loss in "normal" scenarios. This mode gets
+             * triggered once we've gone 2KB without finding any matches.
+             */
+            ms->lazySkipping = step > kLazySkippingStep;
             continue;
         }
 
@@ -1939,7 +2023,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
             ip ++;
             curr++;
             /* check repCode */
-            if (offcode) {
+            if (offBase) {
                 const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                 const U32 repIndex = (U32)(curr - offset_1);
                 const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1951,18 +2035,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                     const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                     size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                     int const gain2 = (int)(repLength * 3);
-                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1);
                     if ((repLength >= 4) && (gain2 > gain1))
-                        matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
+                        matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
             }   }
 
             /* search match, depth 1 */
-            {   size_t offset2=999999999;
-                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict);
-                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4);
+            {   size_t ofbCandidate = 999999999;
+                size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4);
                 if ((ml2 >= 4) && (gain2 > gain1)) {
-                    matchLength = ml2, offcode = offset2, start = ip;
+                    matchLength = ml2, offBase = ofbCandidate, start = ip;
                     continue;   /* search a better one */
             }   }
 
@@ -1971,7 +2055,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                 ip ++;
                 curr++;
                 /* check repCode */
-                if (offcode) {
+                if (offBase) {
                     const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog);
                     const U32 repIndex = (U32)(curr - offset_1);
                     const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
@@ -1983,38 +2067,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                         const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                         size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
                         int const gain2 = (int)(repLength * 4);
-                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1);
+                        int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1);
                         if ((repLength >= 4) && (gain2 > gain1))
-                            matchLength = repLength, offcode = STORE_REPCODE_1, start = ip;
+                            matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip;
                 }   }
 
                 /* search match, depth 2 */
-                {   size_t offset2=999999999;
-                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict);
-                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2)));   /* raw approx */
-                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7);
+                {   size_t ofbCandidate = 999999999;
+                    size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict);
+                    int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate));   /* raw approx */
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7);
                     if ((ml2 >= 4) && (gain2 > gain1)) {
-                        matchLength = ml2, offcode = offset2, start = ip;
+                        matchLength = ml2, offBase = ofbCandidate, start = ip;
                         continue;
             }   }   }
             break;  /* nothing found : store previous solution */
         }
 
         /* catch up */
-        if (STORED_IS_OFFSET(offcode)) {
-            U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode));
+        if (OFFBASE_IS_OFFSET(offBase)) {
+            U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase));
             const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
             const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
             while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; }  /* catch up */
-            offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode);
+            offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase);
         }
 
         /* store sequence */
 _storeSequence:
         {   size_t const litLength = (size_t)(start - anchor);
-            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength);
+            ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength);
             anchor = ip = start + matchLength;
         }
+        if (ms->lazySkipping) {
+            /* We've found a match, disable lazy skipping mode, and refill the hash cache. */
+            if (searchMethod == search_rowHash) {
+                ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit);
+            }
+            ms->lazySkipping = 0;
+        }
 
         /* check immediate repcode */
         while (ip <= ilimit) {
@@ -2029,8 +2120,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
                 /* repcode detected we should take it */
                 const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
                 matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
-                offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode;   /* swap offset history */
-                ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength);
+                offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase;   /* swap offset history */
+                ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength);
                 ip += matchLength;
                 anchor = ip;
                 continue;   /* faster when present ... (?) */
@@ -2045,8 +2136,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic(
     /* Return the last literals size */
     return (size_t)(iend - anchor);
 }
+#endif /* build exclusions */
 
-
+#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
 size_t ZSTD_compressBlock_greedy_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
@@ -2054,49 +2146,55 @@ size_t ZSTD_compressBlock_greedy_extDict(
     return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
 }
 
-size_t ZSTD_compressBlock_lazy_extDict(
+size_t ZSTD_compressBlock_greedy_extDict_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
-
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
 }
+#endif
 
-size_t ZSTD_compressBlock_lazy2_extDict(
+#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_lazy_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
 }
 
-size_t ZSTD_compressBlock_btlazy2_extDict(
+size_t ZSTD_compressBlock_lazy_extDict_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
 }
+#endif
 
-size_t ZSTD_compressBlock_greedy_extDict_row(
+#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_lazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
+
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
 }
 
-size_t ZSTD_compressBlock_lazy_extDict_row(
+size_t ZSTD_compressBlock_lazy2_extDict_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
-
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
 }
+#endif
 
-size_t ZSTD_compressBlock_lazy2_extDict_row(
+#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btlazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize)
 
 {
-    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2);
+    return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
 }
+#endif
diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h
index e5bdf4df8dde..22c9201f4e63 100644
--- a/lib/zstd/compress/zstd_lazy.h
+++ b/lib/zstd/compress/zstd_lazy.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -22,98 +23,175 @@
  */
 #define ZSTD_LAZY_DDSS_BUCKET_LOG 2
 
+#define ZSTD_ROW_HASH_TAG_BITS 8        /* nb bits to use for the tag */
+
+#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR)
 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
 void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip);
 
 void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip);
 
 void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue);  /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */
+#endif
 
-size_t ZSTD_compressBlock_btlazy2(
+#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_greedy(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2(
+size_t ZSTD_compressBlock_greedy_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy(
+size_t ZSTD_compressBlock_greedy_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy(
+size_t ZSTD_compressBlock_greedy_dictMatchState_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2_row(
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_row(
+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy_row(
+size_t ZSTD_compressBlock_greedy_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-
-size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+size_t ZSTD_compressBlock_greedy_extDict_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2_dictMatchState(
+
+#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy
+#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row
+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState
+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row
+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch
+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row
+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict
+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row
+#else
+#define ZSTD_COMPRESSBLOCK_GREEDY NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL
+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL
+#endif
+
+#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_lazy(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_dictMatchState(
+size_t ZSTD_compressBlock_lazy_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy_dictMatchState(
+size_t ZSTD_compressBlock_lazy_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
+size_t ZSTD_compressBlock_lazy_dictMatchState_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_dictMatchState_row(
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy_dictMatchState_row(
+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-
-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
+size_t ZSTD_compressBlock_lazy_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch(
+size_t ZSTD_compressBlock_lazy_extDict_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch(
+
+#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy
+#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row
+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState
+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row
+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch
+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row
+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict
+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row
+#else
+#define ZSTD_COMPRESSBLOCK_LAZY NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL
+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL
+#endif
+
+#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_lazy2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
+size_t ZSTD_compressBlock_lazy2_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row(
+size_t ZSTD_compressBlock_lazy2_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row(
+size_t ZSTD_compressBlock_lazy2_dictMatchState_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-
-size_t ZSTD_compressBlock_greedy_extDict(
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_extDict(
+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_lazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_greedy_extDict_row(
+size_t ZSTD_compressBlock_lazy2_extDict_row(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy_extDict_row(
+
+#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2
+#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row
+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState
+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row
+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch
+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row
+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict
+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row
+#else
+#define ZSTD_COMPRESSBLOCK_LAZY2 NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL
+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL
+#endif
+
+#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btlazy2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_lazy2_extDict_row(
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_btlazy2_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-        
+
+#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2
+#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState
+#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict
+#else
+#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL
+#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL
+#endif
+
 
 
 #endif /* ZSTD_LAZY_H */
diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c
index dd86fc83e7dd..07f3bc6437ce 100644
--- a/lib/zstd/compress/zstd_ldm.c
+++ b/lib/zstd/compress/zstd_ldm.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -242,11 +243,15 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
     switch(ms->cParams.strategy)
     {
     case ZSTD_fast:
-        ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast);
+        ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
         break;
 
     case ZSTD_dfast:
-        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast);
+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR
+        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx);
+#else
+        assert(0); /* shouldn't be called: cparams should've been adjusted. */
+#endif
         break;
 
     case ZSTD_greedy:
@@ -318,7 +323,9 @@ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor)
     }
 }
 
-static size_t ZSTD_ldm_generateSequences_internal(
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_ldm_generateSequences_internal(
         ldmState_t* ldmState, rawSeqStore_t* rawSeqStore,
         ldmParams_t const* params, void const* src, size_t srcSize)
 {
@@ -549,7 +556,7 @@ size_t ZSTD_ldm_generateSequences(
          * the window through early invalidation.
          * TODO: * Test the chunk size.
          *       * Try invalidation after the sequence generation and test the
-         *         the offset against maxDist directly.
+         *         offset against maxDist directly.
          *
          * NOTE: Because of dictionaries + sequence splitting we MUST make sure
          * that any offset used is valid at the END of the sequence, since it may
@@ -689,7 +696,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
         /* maybeSplitSequence updates rawSeqStore->pos */
         rawSeq const sequence = maybeSplitSequence(rawSeqStore,
                                                    (U32)(iend - ip), minMatch);
-        int i;
         /* End signal */
         if (sequence.offset == 0)
             break;
@@ -702,6 +708,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
         /* Run the block compressor */
         DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength);
         {
+            int i;
             size_t const newLitLength =
                 blockCompressor(ms, seqStore, rep, ip, sequence.litLength);
             ip += sequence.litLength;
@@ -711,7 +718,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
             rep[0] = sequence.offset;
             /* Store the sequence */
             ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend,
-                          STORE_OFFSET(sequence.offset),
+                          OFFSET_TO_OFFBASE(sequence.offset),
                           sequence.matchLength);
             ip += sequence.matchLength;
         }
diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h
index fbc6a5e88fd7..c540731abde7 100644
--- a/lib/zstd/compress/zstd_ldm.h
+++ b/lib/zstd/compress/zstd_ldm.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h
index 647f865be290..cfccfc46f6f7 100644
--- a/lib/zstd/compress/zstd_ldm_geartab.h
+++ b/lib/zstd/compress/zstd_ldm_geartab.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c
index fd82acfda62f..a87b66ac8d24 100644
--- a/lib/zstd/compress/zstd_opt.c
+++ b/lib/zstd/compress/zstd_opt.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -12,11 +13,14 @@
 #include "hist.h"
 #include "zstd_opt.h"
 
+#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
 
 #define ZSTD_LITFREQ_ADD    2   /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
 #define ZSTD_MAX_PRICE     (1<<30)
 
-#define ZSTD_PREDEF_THRESHOLD 1024   /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
+#define ZSTD_PREDEF_THRESHOLD 8   /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
 
 
 /*-*************************************
@@ -26,27 +30,35 @@
 #if 0    /* approximation at bit level (for tests) */
 #  define BITCOST_ACCURACY 0
 #  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-#  define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat))
+#  define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat))
 #elif 0  /* fractional bit accuracy (for tests) */
 #  define BITCOST_ACCURACY 8
 #  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-#  define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
+#  define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat))
 #else    /* opt==approx, ultra==accurate */
 #  define BITCOST_ACCURACY 8
 #  define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
-#  define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
+#  define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
 #endif
 
+/* ZSTD_bitWeight() :
+ * provide estimated "cost" of a stat in full bits only */
 MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
 {
     return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER);
 }
 
+/* ZSTD_fracWeight() :
+ * provide fractional-bit "cost" of a stat,
+ * using linear interpolation approximation */
 MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
 {
     U32 const stat = rawStat + 1;
     U32 const hb = ZSTD_highbit32(stat);
     U32 const BWeight = hb * BITCOST_MULTIPLIER;
+    /* Fweight was meant for "Fractional weight"
+     * but it's effectively a value between 1 and 2
+     * using fixed point arithmetic */
     U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
     U32 const weight = BWeight + FWeight;
     assert(hb + BITCOST_ACCURACY < 31);
@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
 /* debugging function,
  * @return price in bytes as fractional value
  * for debug messages only */
-MEM_STATIC double ZSTD_fCost(U32 price)
+MEM_STATIC double ZSTD_fCost(int price)
 {
     return (double)price / (BITCOST_MULTIPLIER*8);
 }
@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts)
     return total;
 }
 
-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift)
+typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e;
+
+static U32
+ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1)
 {
     U32 s, sum=0;
-    DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift);
+    DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)",
+            (unsigned)lastEltIndex+1, (unsigned)shift );
     assert(shift < 30);
     for (s=0; s<lastEltIndex+1; s++) {
-        table[s] = 1 + (table[s] >> shift);
-        sum += table[s];
+        unsigned const base = base1 ? 1 : (table[s]>0);
+        unsigned const newStat = base + (table[s] >> shift);
+        sum += newStat;
+        table[s] = newStat;
     }
     return sum;
 }
 
 /* ZSTD_scaleStats() :
- * reduce all elements in table is sum too large
+ * reduce all elt frequencies in table if sum too large
  * return the resulting sum of elements */
 static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
 {
@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget)
     DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget);
     assert(logTarget < 30);
     if (factor <= 1) return prevsum;
-    return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor));
+    return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed);
 }
 
 /* ZSTD_rescaleFreqs() :
@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
     DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize);
     optPtr->priceType = zop_dynamic;
 
-    if (optPtr->litLengthSum == 0) {  /* first block : init */
-        if (srcSize <= ZSTD_PREDEF_THRESHOLD) {  /* heuristic */
-            DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef");
+    if (optPtr->litLengthSum == 0) {  /* no literals stats collected -> first block assumed -> init */
+
+        /* heuristic: use pre-defined stats for too small inputs */
+        if (srcSize <= ZSTD_PREDEF_THRESHOLD) {
+            DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD);
             optPtr->priceType = zop_predef;
         }
 
         assert(optPtr->symbolCosts != NULL);
         if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) {
-            /* huffman table presumed generated by dictionary */
+
+            /* huffman stats covering the full value set : table presumed generated by dictionary */
             optPtr->priceType = zop_dynamic;
 
             if (compressedLiterals) {
+                /* generate literals statistics from huffman table */
                 unsigned lit;
                 assert(optPtr->litFreq != NULL);
                 optPtr->litSum = 0;
@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
                     optPtr->offCodeSum += optPtr->offCodeFreq[of];
             }   }
 
-        } else {  /* not a dictionary */
+        } else {  /* first block, no dictionary */
 
             assert(optPtr->litFreq != NULL);
             if (compressedLiterals) {
+                /* base initial cost of literals on direct frequency within src */
                 unsigned lit = MaxLit;
                 HIST_count_simple(optPtr->litFreq, &lit, src, srcSize);   /* use raw first block to init statistics */
-                optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8);
+                optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible);
             }
 
             {   unsigned const baseLLfreqs[MaxLL+1] = {
@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr,
                 optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1);
             }
 
-
         }
 
-    } else {   /* new block : re-use previous statistics, scaled down */
+    } else {   /* new block : scale down accumulated statistics */
 
         if (compressedLiterals)
             optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12);
@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
                                 const optState_t* const optPtr,
                                 int optLevel)
 {
+    DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength);
     if (litLength == 0) return 0;
 
     if (!ZSTD_compressedLiterals(optPtr))
@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
         return (litLength*6) * BITCOST_MULTIPLIER;  /* 6 bit per literal - no statistic used */
 
     /* dynamic statistics */
-    {   U32 price = litLength * optPtr->litSumBasePrice;
+    {   U32 price = optPtr->litSumBasePrice * litLength;
+        U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER;
         U32 u;
+        assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER);
         for (u=0; u < litLength; u++) {
-            assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice);   /* literal cost should never be negative */
-            price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel);
+            U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel);
+            if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax;
+            price -= litPrice;
         }
         return price;
     }
@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
     assert(litLength <= ZSTD_BLOCKSIZE_MAX);
     if (optPtr->priceType == zop_predef)
         return WEIGHT(litLength, optLevel);
-    /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
-     * because it isn't representable in the zstd format. So instead just
-     * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block
-     * would be all literals.
+
+    /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX
+     * because it isn't representable in the zstd format.
+     * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1.
+     * In such a case, the block would be all literals.
      */
     if (litLength == ZSTD_BLOCKSIZE_MAX)
         return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel);
@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP
 }
 
 /* ZSTD_getMatchPrice() :
- * Provides the cost of the match part (offset + matchLength) of a sequence
+ * Provides the cost of the match part (offset + matchLength) of a sequence.
  * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence.
- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2
+ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq()
  * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency)
  */
 FORCE_INLINE_TEMPLATE U32
-ZSTD_getMatchPrice(U32 const offcode,
+ZSTD_getMatchPrice(U32 const offBase,
                    U32 const matchLength,
              const optState_t* const optPtr,
                    int const optLevel)
 {
     U32 price;
-    U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode));
+    U32 const offCode = ZSTD_highbit32(offBase);
     U32 const mlBase = matchLength - MINMATCH;
     assert(matchLength >= MINMATCH);
 
-    if (optPtr->priceType == zop_predef)  /* fixed scheme, do not use statistics */
-        return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER);
+    if (optPtr->priceType == zop_predef)  /* fixed scheme, does not use statistics */
+        return WEIGHT(mlBase, optLevel)
+             + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */
 
     /* dynamic statistics */
     price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel));
@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode,
 }
 
 /* ZSTD_updateStats() :
- * assumption : literals + litLengtn <= iend */
+ * assumption : literals + litLength <= iend */
 static void ZSTD_updateStats(optState_t* const optPtr,
                              U32 litLength, const BYTE* literals,
-                             U32 offsetCode, U32 matchLength)
+                             U32 offBase, U32 matchLength)
 {
     /* literals */
     if (ZSTD_compressedLiterals(optPtr)) {
@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* const optPtr,
         optPtr->litLengthSum++;
     }
 
-    /* offset code : expected to follow storeSeq() numeric representation */
-    {   U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode));
+    /* offset code : follows storeSeq() numeric representation */
+    {   U32 const offCode = ZSTD_highbit32(offBase);
         assert(offCode <= MaxOff);
         optPtr->offCodeFreq[offCode]++;
         optPtr->offCodeSum++;
@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length)
 
 /* Update hashTable3 up to ip (excluded)
    Assumption : always within prefix (i.e. not within extDict) */
-static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,
-                                              U32* nextToUpdate3,
-                                              const BYTE* const ip)
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,
+                                       U32* nextToUpdate3,
+                                       const BYTE* const ip)
 {
     U32* const hashTable3 = ms->hashTable3;
     U32 const hashLog3 = ms->hashLog3;
@@ -408,7 +438,9 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms,
  * @param ip assumed <= iend-8 .
  * @param target The target of ZSTD_updateTree_internal() - we are filling to this position
  * @return : nb of positions added */
-static U32 ZSTD_insertBt1(
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_insertBt1(
                 const ZSTD_matchState_t* ms,
                 const BYTE* const ip, const BYTE* const iend,
                 U32 const target,
@@ -527,6 +559,7 @@ static U32 ZSTD_insertBt1(
 }
 
 FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
 void ZSTD_updateTree_internal(
                 ZSTD_matchState_t* ms,
                 const BYTE* const ip, const BYTE* const iend,
@@ -535,7 +568,7 @@ void ZSTD_updateTree_internal(
     const BYTE* const base = ms->window.base;
     U32 const target = (U32)(ip - base);
     U32 idx = ms->nextToUpdate;
-    DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u  (dictMode:%u)",
+    DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u  (dictMode:%u)",
                 idx, target, dictMode);
 
     while(idx < target) {
@@ -553,15 +586,18 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
 }
 
 FORCE_INLINE_TEMPLATE
-U32 ZSTD_insertBtAndGetAllMatches (
-                    ZSTD_match_t* matches,   /* store result (found matches) in this table (presumed large enough) */
-                    ZSTD_matchState_t* ms,
-                    U32* nextToUpdate3,
-                    const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode,
-                    const U32 rep[ZSTD_REP_NUM],
-                    U32 const ll0,   /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
-                    const U32 lengthToBeat,
-                    U32 const mls /* template */)
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32
+ZSTD_insertBtAndGetAllMatches (
+                ZSTD_match_t* matches,  /* store result (found matches) in this table (presumed large enough) */
+                ZSTD_matchState_t* ms,
+                U32* nextToUpdate3,
+                const BYTE* const ip, const BYTE* const iLimit,
+                const ZSTD_dictMode_e dictMode,
+                const U32 rep[ZSTD_REP_NUM],
+                const U32 ll0,  /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
+                const U32 lengthToBeat,
+                const U32 mls /* template */)
 {
     const ZSTD_compressionParameters* const cParams = &ms->cParams;
     U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
                 DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
                             repCode, ll0, repOffset, repLen);
                 bestLength = repLen;
-                matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1);  /* expect value between 1 and 3 */
+                matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1);  /* expect value between 1 and 3 */
                 matches[mnum].len = (U32)repLen;
                 mnum++;
                 if ( (repLen > sufficient_len)
@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
                 bestLength = mlen;
                 assert(curr > matchIndex3);
                 assert(mnum==0);  /* no prior solution */
-                matches[0].off = STORE_OFFSET(curr - matchIndex3);
+                matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3);
                 matches[0].len = (U32)mlen;
                 mnum = 1;
                 if ( (mlen > sufficient_len) |
@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches (
         }
 
         if (matchLength > bestLength) {
-            DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)",
-                    (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
+            DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)",
+                    (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
             assert(matchEndIdx > matchIndex);
             if (matchLength > matchEndIdx - matchIndex)
                 matchEndIdx = matchIndex + (U32)matchLength;
             bestLength = matchLength;
-            matches[mnum].off = STORE_OFFSET(curr - matchIndex);
+            matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
             matches[mnum].len = (U32)matchLength;
             mnum++;
             if ( (matchLength > ZSTD_OPT_NUM)
@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches (
 
             if (matchLength > bestLength) {
                 matchIndex = dictMatchIndex + dmsIndexDelta;
-                DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)",
-                        (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex));
+                DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)",
+                        (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex));
                 if (matchLength > matchEndIdx - matchIndex)
                     matchEndIdx = matchIndex + (U32)matchLength;
                 bestLength = matchLength;
-                matches[mnum].off = STORE_OFFSET(curr - matchIndex);
+                matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex);
                 matches[mnum].len = (U32)matchLength;
                 mnum++;
                 if ( (matchLength > ZSTD_OPT_NUM)
@@ -792,7 +828,9 @@ typedef U32 (*ZSTD_getAllMatchesFn)(
     U32 const ll0,
     U32 const lengthToBeat);
 
-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal(
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+U32 ZSTD_btGetAllMatches_internal(
         ZSTD_match_t* matches,
         ZSTD_matchState_t* ms,
         U32* nextToUpdate3,
@@ -960,7 +998,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
                                       const ZSTD_optLdm_t* optLdm, U32 currPosInBlock)
 {
     U32 const posDiff = currPosInBlock - optLdm->startPosInBlock;
-    /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */
+    /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */
     U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff;
 
     /* Ensure that current block position is not outside of the match */
@@ -971,11 +1009,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches,
     }
 
     if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) {
-        U32 const candidateOffCode = STORE_OFFSET(optLdm->offset);
-        DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u",
-                 candidateOffCode, candidateMatchLength, currPosInBlock);
+        U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset);
+        DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u",
+                 candidateOffBase, candidateMatchLength, currPosInBlock);
         matches[*nbMatches].len = candidateMatchLength;
-        matches[*nbMatches].off = candidateOffCode;
+        matches[*nbMatches].off = candidateOffBase;
         (*nbMatches)++;
     }
 }
@@ -1011,11 +1049,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm,
 *  Optimal parser
 *********************************/
 
-static U32 ZSTD_totalLen(ZSTD_optimal_t sol)
-{
-    return sol.litlen + sol.mlen;
-}
-
 #if 0 /* debug */
 
 static void
@@ -1033,7 +1066,13 @@ listStats(const U32* table, int lastEltID)
 
 #endif
 
-FORCE_INLINE_TEMPLATE size_t
+#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel)
+#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel)
+#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1))
+
+FORCE_INLINE_TEMPLATE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t
 ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                                seqStore_t* seqStore,
                                U32 rep[ZSTD_REP_NUM],
@@ -1059,9 +1098,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
 
     ZSTD_optimal_t* const opt = optStatePtr->priceTable;
     ZSTD_match_t* const matches = optStatePtr->matchTable;
-    ZSTD_optimal_t lastSequence;
+    ZSTD_optimal_t lastStretch;
     ZSTD_optLdm_t optLdm;
 
+    ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t));
+
     optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore;
     optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0;
     ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip));
@@ -1082,103 +1123,139 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
             U32 const ll0 = !litlen;
             U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch);
             ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches,
-                                              (U32)(ip-istart), (U32)(iend - ip));
-            if (!nbMatches) { ip++; continue; }
+                                              (U32)(ip-istart), (U32)(iend-ip));
+            if (!nbMatches) {
+                DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart));
+                ip++;
+                continue;
+            }
+
+            /* Match found: let's store this solution, and eventually find more candidates.
+             * During this forward pass, @opt is used to store stretches,
+             * defined as "a match followed by N literals".
+             * Note how this is different from a Sequence, which is "N literals followed by a match".
+             * Storing stretches allows us to store different match predecessors
+             * for each literal position part of a literals run. */
 
             /* initialize opt[0] */
-            { U32 i ; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; }
-            opt[0].mlen = 0;  /* means is_a_literal */
+            opt[0].mlen = 0;  /* there are only literals so far */
             opt[0].litlen = litlen;
-            /* We don't need to include the actual price of the literals because
-             * it is static for the duration of the forward pass, and is included
-             * in every price. We include the literal length to avoid negative
-             * prices when we subtract the previous literal length.
+            /* No need to include the actual price of the literals before the first match
+             * because it is static for the duration of the forward pass, and is included
+             * in every subsequent price. But, we include the literal length because
+             * the cost variation of litlen depends on the value of litlen.
              */
-            opt[0].price = (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel);
+            opt[0].price = LL_PRICE(litlen);
+            ZSTD_STATIC_ASSERT(sizeof(opt[0].rep[0]) == sizeof(rep[0]));
+            ZSTD_memcpy(&opt[0].rep, rep, sizeof(opt[0].rep));
 
             /* large match -> immediate encoding */
             {   U32 const maxML = matches[nbMatches-1].len;
-                U32 const maxOffcode = matches[nbMatches-1].off;
-                DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series",
-                            nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart));
+                U32 const maxOffBase = matches[nbMatches-1].off;
+                DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series",
+                            nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart));
 
                 if (maxML > sufficient_len) {
-                    lastSequence.litlen = litlen;
-                    lastSequence.mlen = maxML;
-                    lastSequence.off = maxOffcode;
-                    DEBUGLOG(6, "large match (%u>%u), immediate encoding",
+                    lastStretch.litlen = 0;
+                    lastStretch.mlen = maxML;
+                    lastStretch.off = maxOffBase;
+                    DEBUGLOG(6, "large match (%u>%u) => immediate encoding",
                                 maxML, sufficient_len);
                     cur = 0;
-                    last_pos = ZSTD_totalLen(lastSequence);
+                    last_pos = maxML;
                     goto _shortestPath;
             }   }
 
             /* set prices for first matches starting position == 0 */
             assert(opt[0].price >= 0);
-            {   U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
-                U32 pos;
+            {   U32 pos;
                 U32 matchNb;
                 for (pos = 1; pos < minMatch; pos++) {
-                    opt[pos].price = ZSTD_MAX_PRICE;   /* mlen, litlen and price will be fixed during forward scanning */
+                    opt[pos].price = ZSTD_MAX_PRICE;
+                    opt[pos].mlen = 0;
+                    opt[pos].litlen = litlen + pos;
                 }
                 for (matchNb = 0; matchNb < nbMatches; matchNb++) {
-                    U32 const offcode = matches[matchNb].off;
+                    U32 const offBase = matches[matchNb].off;
                     U32 const end = matches[matchNb].len;
                     for ( ; pos <= end ; pos++ ) {
-                        U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel);
-                        U32 const sequencePrice = literalsPrice + matchPrice;
+                        int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel);
+                        int const sequencePrice = opt[0].price + matchPrice;
                         DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
                                     pos, ZSTD_fCost(sequencePrice));
                         opt[pos].mlen = pos;
-                        opt[pos].off = offcode;
-                        opt[pos].litlen = litlen;
-                        opt[pos].price = (int)sequencePrice;
-                }   }
+                        opt[pos].off = offBase;
+                        opt[pos].litlen = 0; /* end of match */
+                        opt[pos].price = sequencePrice + LL_PRICE(0);
+                    }
+                }
                 last_pos = pos-1;
+                opt[pos].price = ZSTD_MAX_PRICE;
             }
         }
 
         /* check further positions */
         for (cur = 1; cur <= last_pos; cur++) {
             const BYTE* const inr = ip + cur;
-            assert(cur < ZSTD_OPT_NUM);
-            DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur)
+            assert(cur <= ZSTD_OPT_NUM);
+            DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur);
 
             /* Fix current position with one literal if cheaper */
-            {   U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1;
+            {   U32 const litlen = opt[cur-1].litlen + 1;
                 int const price = opt[cur-1].price
-                                + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel)
-                                + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel)
-                                - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel);
+                                + LIT_PRICE(ip+cur-1)
+                                + LL_INCPRICE(litlen);
                 assert(price < 1000000000); /* overflow check */
                 if (price <= opt[cur].price) {
+                    ZSTD_optimal_t const prevMatch = opt[cur];
                     DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)",
                                 inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen,
                                 opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]);
-                    opt[cur].mlen = 0;
-                    opt[cur].off = 0;
+                    opt[cur] = opt[cur-1];
                     opt[cur].litlen = litlen;
                     opt[cur].price = price;
+                    if ( (optLevel >= 1) /* additional check only for higher modes */
+                      && (prevMatch.litlen == 0) /* replace a match */
+                      && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */
+                      && LIKELY(ip + cur < iend)
+                    ) {
+                        /* check next position, in case it would be cheaper */
+                        int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1);
+                        int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1);
+                        DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f",
+                                cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals));
+                        if ( (with1literal < withMoreLiterals)
+                          && (with1literal < opt[cur+1].price) ) {
+                            /* update offset history - before it disappears */
+                            U32 const prev = cur - prevMatch.mlen;
+                            repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0);
+                            assert(cur >= prevMatch.mlen);
+                            DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !",
+                                        ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals),
+                                        newReps.rep[0], newReps.rep[1], newReps.rep[2] );
+                            opt[cur+1] = prevMatch;  /* mlen & offbase */
+                            ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(repcodes_t));
+                            opt[cur+1].litlen = 1;
+                            opt[cur+1].price = with1literal;
+                            if (last_pos < cur+1) last_pos = cur+1;
+                        }
+                    }
                 } else {
-                    DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)",
-                                inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price),
-                                opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]);
+                    DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f)",
+                                inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price));
                 }
             }
 
-            /* Set the repcodes of the current position. We must do it here
-             * because we rely on the repcodes of the 2nd to last sequence being
-             * correct to set the next chunks repcodes during the backward
-             * traversal.
+            /* Offset history is not updated during match comparison.
+             * Do it here, now that the match is selected and confirmed.
              */
             ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t));
             assert(cur >= opt[cur].mlen);
-            if (opt[cur].mlen != 0) {
+            if (opt[cur].litlen == 0) {
+                /* just finished a match => alter offset history */
                 U32 const prev = cur - opt[cur].mlen;
-                repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0);
+                repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0);
                 ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t));
-            } else {
-                ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t));
             }
 
             /* last match must start at a minimum distance of 8 from oend */
@@ -1188,15 +1265,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
 
             if ( (optLevel==0) /*static_test*/
               && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) {
-                DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1);
+                DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1);
                 continue;  /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
             }
 
             assert(opt[cur].price >= 0);
-            {   U32 const ll0 = (opt[cur].mlen != 0);
-                U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
-                U32 const previousPrice = (U32)opt[cur].price;
-                U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
+            {   U32 const ll0 = (opt[cur].litlen == 0);
+                int const previousPrice = opt[cur].price;
+                int const basePrice = previousPrice + LL_PRICE(0);
                 U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch);
                 U32 matchNb;
 
@@ -1208,18 +1284,17 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                     continue;
                 }
 
-                {   U32 const maxML = matches[nbMatches-1].len;
-                    DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u",
-                                inr-istart, cur, nbMatches, maxML);
-
-                    if ( (maxML > sufficient_len)
-                      || (cur + maxML >= ZSTD_OPT_NUM) ) {
-                        lastSequence.mlen = maxML;
-                        lastSequence.off = matches[nbMatches-1].off;
-                        lastSequence.litlen = litlen;
-                        cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0;  /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */
-                        last_pos = cur + ZSTD_totalLen(lastSequence);
-                        if (cur > ZSTD_OPT_NUM) cur = 0;   /* underflow => first match */
+                {   U32 const longestML = matches[nbMatches-1].len;
+                    DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of longest ML=%u",
+                                inr-istart, cur, nbMatches, longestML);
+
+                    if ( (longestML > sufficient_len)
+                      || (cur + longestML >= ZSTD_OPT_NUM)
+                      || (ip + cur + longestML >= iend) ) {
+                        lastStretch.mlen = longestML;
+                        lastStretch.off = matches[nbMatches-1].off;
+                        lastStretch.litlen = 0;
+                        last_pos = cur + longestML;
                         goto _shortestPath;
                 }   }
 
@@ -1230,20 +1305,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                     U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
                     U32 mlen;
 
-                    DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u",
-                                matchNb, matches[matchNb].off, lastML, litlen);
+                    DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u",
+                                matchNb, matches[matchNb].off, lastML, opt[cur].litlen);
 
                     for (mlen = lastML; mlen >= startML; mlen--) {  /* scan downward */
                         U32 const pos = cur + mlen;
-                        int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
+                        int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
 
                         if ((pos > last_pos) || (price < opt[pos].price)) {
                             DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)",
                                         pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
-                            while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; }   /* fill empty positions */
+                            while (last_pos < pos) {
+                                /* fill empty positions, for future comparisons */
+                                last_pos++;
+                                opt[last_pos].price = ZSTD_MAX_PRICE;
+                                opt[last_pos].litlen = !0;  /* just needs to be != 0, to mean "not an end of match" */
+                            }
                             opt[pos].mlen = mlen;
                             opt[pos].off = offset;
-                            opt[pos].litlen = litlen;
+                            opt[pos].litlen = 0;
                             opt[pos].price = price;
                         } else {
                             DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)",
@@ -1251,52 +1331,86 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                             if (optLevel==0) break;  /* early update abort; gets ~+10% speed for about -0.01 ratio loss */
                         }
             }   }   }
+            opt[last_pos+1].price = ZSTD_MAX_PRICE;
         }  /* for (cur = 1; cur <= last_pos; cur++) */
 
-        lastSequence = opt[last_pos];
-        cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0;  /* single sequence, and it starts before `ip` */
-        assert(cur < ZSTD_OPT_NUM);  /* control overflow*/
+        lastStretch = opt[last_pos];
+        assert(cur >= lastStretch.mlen);
+        cur = last_pos - lastStretch.mlen;
 
 _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
         assert(opt[0].mlen == 0);
+        assert(last_pos >= lastStretch.mlen);
+        assert(cur == last_pos - lastStretch.mlen);
 
-        /* Set the next chunk's repcodes based on the repcodes of the beginning
-         * of the last match, and the last sequence. This avoids us having to
-         * update them while traversing the sequences.
-         */
-        if (lastSequence.mlen != 0) {
-            repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0);
-            ZSTD_memcpy(rep, &reps, sizeof(reps));
+        if (lastStretch.mlen==0) {
+            /* no solution : all matches have been converted into literals */
+            assert(lastStretch.litlen == (ip - anchor) + last_pos);
+            ip += last_pos;
+            continue;
+        }
+        assert(lastStretch.off > 0);
+
+        /* Update offset history */
+        if (lastStretch.litlen == 0) {
+            /* finishing on a match : update offset history */
+            repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0);
+            ZSTD_memcpy(rep, &reps, sizeof(repcodes_t));
         } else {
-            ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t));
+            ZSTD_memcpy(rep, lastStretch.rep, sizeof(repcodes_t));
+            assert(cur >= lastStretch.litlen);
+            cur -= lastStretch.litlen;
         }
 
-        {   U32 const storeEnd = cur + 1;
+        /* Let's write the shortest path solution.
+         * It is stored in @opt in reverse order,
+         * starting from @storeEnd (==cur+2),
+         * effectively partially @opt overwriting.
+         * Content is changed too:
+         * - So far, @opt stored stretches, aka a match followed by literals
+         * - Now, it will store sequences, aka literals followed by a match
+         */
+        {   U32 const storeEnd = cur + 2;
             U32 storeStart = storeEnd;
-            U32 seqPos = cur;
+            U32 stretchPos = cur;
 
             DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)",
                         last_pos, cur); (void)last_pos;
-            assert(storeEnd < ZSTD_OPT_NUM);
-            DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
-                        storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off);
-            opt[storeEnd] = lastSequence;
-            while (seqPos > 0) {
-                U32 const backDist = ZSTD_totalLen(opt[seqPos]);
+            assert(storeEnd < ZSTD_OPT_SIZE);
+            DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
+                        storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off);
+            if (lastStretch.litlen > 0) {
+                /* last "sequence" is unfinished: just a bunch of literals */
+                opt[storeEnd].litlen = lastStretch.litlen;
+                opt[storeEnd].mlen = 0;
+                storeStart = storeEnd-1;
+                opt[storeStart] = lastStretch;
+            } {
+                opt[storeEnd] = lastStretch;  /* note: litlen will be fixed */
+                storeStart = storeEnd;
+            }
+            while (1) {
+                ZSTD_optimal_t nextStretch = opt[stretchPos];
+                opt[storeStart].litlen = nextStretch.litlen;
+                DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)",
+                            opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off);
+                if (nextStretch.mlen == 0) {
+                    /* reaching beginning of segment */
+                    break;
+                }
                 storeStart--;
-                DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
-                            seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off);
-                opt[storeStart] = opt[seqPos];
-                seqPos = (seqPos > backDist) ? seqPos - backDist : 0;
+                opt[storeStart] = nextStretch; /* note: litlen will be fixed */
+                assert(nextStretch.litlen + nextStretch.mlen <= stretchPos);
+                stretchPos -= nextStretch.litlen + nextStretch.mlen;
             }
 
             /* save sequences */
-            DEBUGLOG(6, "sending selected sequences into seqStore")
+            DEBUGLOG(6, "sending selected sequences into seqStore");
             {   U32 storePos;
                 for (storePos=storeStart; storePos <= storeEnd; storePos++) {
                     U32 const llen = opt[storePos].litlen;
                     U32 const mlen = opt[storePos].mlen;
-                    U32 const offCode = opt[storePos].off;
+                    U32 const offBase = opt[storePos].off;
                     U32 const advance = llen + mlen;
                     DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u",
                                 anchor - istart, (unsigned)llen, (unsigned)mlen);
@@ -1308,11 +1422,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
                     }
 
                     assert(anchor + llen <= iend);
-                    ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen);
-                    ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen);
+                    ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen);
+                    ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen);
                     anchor += advance;
                     ip = anchor;
             }   }
+            DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]);
+
+            /* update all costs */
             ZSTD_setBasePrices(optStatePtr, optLevel);
         }
     }   /* while (ip < ilimit) */
@@ -1320,21 +1437,27 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
     /* Return the last literals size */
     return (size_t)(iend - anchor);
 }
+#endif /* build exclusions */
 
+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
 static size_t ZSTD_compressBlock_opt0(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
 {
     return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode);
 }
+#endif
 
+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
 static size_t ZSTD_compressBlock_opt2(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode)
 {
     return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode);
 }
+#endif
 
+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
 size_t ZSTD_compressBlock_btopt(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         const void* src, size_t srcSize)
@@ -1342,20 +1465,23 @@ size_t ZSTD_compressBlock_btopt(
     DEBUGLOG(5, "ZSTD_compressBlock_btopt");
     return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
 }
+#endif
 
 
 
 
+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
 /* ZSTD_initStats_ultra():
  * make a first compression pass, just to seed stats with more accurate starting values.
  * only works on first block, with no dictionary and no ldm.
- * this function cannot error, hence its contract must be respected.
+ * this function cannot error out, its narrow contract must be respected.
  */
-static void
-ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
-                     seqStore_t* seqStore,
-                     U32 rep[ZSTD_REP_NUM],
-               const void* src, size_t srcSize)
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+void ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
+                          seqStore_t* seqStore,
+                          U32 rep[ZSTD_REP_NUM],
+                    const void* src, size_t srcSize)
 {
     U32 tmpRep[ZSTD_REP_NUM];  /* updated rep codes will sink here */
     ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep));
@@ -1368,7 +1494,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
 
     ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict);   /* generate stats into ms->opt*/
 
-    /* invalidate first scan from history */
+    /* invalidate first scan from history, only keep entropy stats */
     ZSTD_resetSeqStore(seqStore);
     ms->window.base -= srcSize;
     ms->window.dictLimit += (U32)srcSize;
@@ -1392,10 +1518,10 @@ size_t ZSTD_compressBlock_btultra2(
     U32 const curr = (U32)((const BYTE*)src - ms->window.base);
     DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize);
 
-    /* 2-pass strategy:
+    /* 2-passes strategy:
      * this strategy makes a first pass over first block to collect statistics
-     * and seed next round's statistics with it.
-     * After 1st pass, function forgets everything, and starts a new block.
+     * in order to seed next round's statistics with it.
+     * After 1st pass, function forgets history, and starts a new block.
      * Consequently, this can only work if no data has been previously loaded in tables,
      * aka, no dictionary, no prefix, no ldm preprocessing.
      * The compression ratio gain is generally small (~0.5% on first block),
@@ -1404,15 +1530,17 @@ size_t ZSTD_compressBlock_btultra2(
     if ( (ms->opt.litLengthSum==0)   /* first block */
       && (seqStore->sequences == seqStore->sequencesStart)  /* no ldm */
       && (ms->window.dictLimit == ms->window.lowLimit)   /* no dictionary */
-      && (curr == ms->window.dictLimit)   /* start of frame, nothing already loaded nor skipped */
-      && (srcSize > ZSTD_PREDEF_THRESHOLD)
+      && (curr == ms->window.dictLimit)    /* start of frame, nothing already loaded nor skipped */
+      && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */
       ) {
         ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
     }
 
     return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict);
 }
+#endif
 
+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
 size_t ZSTD_compressBlock_btopt_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         const void* src, size_t srcSize)
@@ -1420,18 +1548,20 @@ size_t ZSTD_compressBlock_btopt_dictMatchState(
     return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
 }
 
-size_t ZSTD_compressBlock_btultra_dictMatchState(
+size_t ZSTD_compressBlock_btopt_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
+    return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
 }
+#endif
 
-size_t ZSTD_compressBlock_btopt_extDict(
+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btultra_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
+    return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState);
 }
 
 size_t ZSTD_compressBlock_btultra_extDict(
@@ -1440,6 +1570,7 @@ size_t ZSTD_compressBlock_btultra_extDict(
 {
     return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict);
 }
+#endif
 
 /* note : no btultra2 variant for extDict nor dictMatchState,
  * because btultra2 is not meant to work with dictionaries
diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h
index 22b862858ba7..ac1b743d27cd 100644
--- a/lib/zstd/compress/zstd_opt.h
+++ b/lib/zstd/compress/zstd_opt.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,30 +15,40 @@
 
 #include "zstd_compress_internal.h"
 
+#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \
+ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR)
 /* used in ZSTD_loadDictionaryContent() */
 void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend);
+#endif
 
+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR
 size_t ZSTD_compressBlock_btopt(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_btultra(
+size_t ZSTD_compressBlock_btopt_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-size_t ZSTD_compressBlock_btultra2(
+size_t ZSTD_compressBlock_btopt_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
 
+#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt
+#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState
+#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict
+#else
+#define ZSTD_COMPRESSBLOCK_BTOPT NULL
+#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL
+#endif
 
-size_t ZSTD_compressBlock_btopt_dictMatchState(
+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR
+size_t ZSTD_compressBlock_btultra(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_btultra_dictMatchState(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
-
-size_t ZSTD_compressBlock_btopt_extDict(
-        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
-        void const* src, size_t srcSize);
 size_t ZSTD_compressBlock_btultra_extDict(
         ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
         void const* src, size_t srcSize);
@@ -45,6 +56,20 @@ size_t ZSTD_compressBlock_btultra_extDict(
         /* note : no btultra2 variant for extDict nor dictMatchState,
          * because btultra2 is not meant to work with dictionaries
          * and is only specific for the first block (no prefix) */
+size_t ZSTD_compressBlock_btultra2(
+        ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+        void const* src, size_t srcSize);
+
+#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra
+#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState
+#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict
+#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2
+#else
+#define ZSTD_COMPRESSBLOCK_BTULTRA NULL
+#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL
+#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL
+#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL
+#endif
 
 
 #endif /* ZSTD_OPT_H */
diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c
index 60958afebc41..ac8b87f48f84 100644
--- a/lib/zstd/decompress/huf_decompress.c
+++ b/lib/zstd/decompress/huf_decompress.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /* ******************************************************************
  * huff0 huffman decoder,
  * part of Finite State Entropy library
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  *  You can contact the author at :
  *  - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
@@ -19,10 +20,10 @@
 #include "../common/compiler.h"
 #include "../common/bitstream.h"  /* BIT_* */
 #include "../common/fse.h"        /* to compress headers */
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "../common/error_private.h"
 #include "../common/zstd_internal.h"
+#include "../common/bits.h"       /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */
 
 /* **************************************************************
 *  Constants
@@ -34,6 +35,12 @@
 *  Macros
 ****************************************************************/
 
+#ifdef HUF_DISABLE_FAST_DECODE
+# define HUF_ENABLE_FAST_DECODE 0
+#else
+# define HUF_ENABLE_FAST_DECODE 1
+#endif
+
 /* These two optional macros force the use one way or another of the two
  * Huffman decompression implementations. You can't force in both directions
  * at the same time.
@@ -43,27 +50,25 @@
 #error "Cannot force the use of the X1 and X2 decoders at the same time!"
 #endif
 
-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2
-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
+/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is
+ * supported at runtime, so we can add the BMI2 target attribute.
+ * When it is disabled, we will still get BMI2 if it is enabled statically.
+ */
+#if DYNAMIC_BMI2
+# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE
 #else
-# define HUF_ASM_X86_64_BMI2_ATTRS
+# define HUF_FAST_BMI2_ATTRS
 #endif
 
 #define HUF_EXTERN_C
 #define HUF_ASM_DECL HUF_EXTERN_C
 
-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
+#if DYNAMIC_BMI2
 # define HUF_NEED_BMI2_FUNCTION 1
 #else
 # define HUF_NEED_BMI2_FUNCTION 0
 #endif
 
-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__))
-# define HUF_NEED_DEFAULT_FUNCTION 1
-#else
-# define HUF_NEED_DEFAULT_FUNCTION 0
-#endif
-
 /* **************************************************************
 *  Error Management
 ****************************************************************/
@@ -80,6 +85,11 @@
 /* **************************************************************
 *  BMI2 Variant Wrappers
 ****************************************************************/
+typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize,
+                                              const void *cSrc,
+                                              size_t cSrcSize,
+                                              const HUF_DTable *DTable);
+
 #if DYNAMIC_BMI2
 
 #define HUF_DGEN(fn)                                                        \
@@ -101,9 +111,9 @@
     }                                                                       \
                                                                             \
     static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
-                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+                     size_t cSrcSize, HUF_DTable const* DTable, int flags)  \
     {                                                                       \
-        if (bmi2) {                                                         \
+        if (flags & HUF_flags_bmi2) {                                       \
             return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);         \
         }                                                                   \
         return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable);          \
@@ -113,9 +123,9 @@
 
 #define HUF_DGEN(fn)                                                        \
     static size_t fn(void* dst, size_t dstSize, void const* cSrc,           \
-                     size_t cSrcSize, HUF_DTable const* DTable, int bmi2)   \
+                     size_t cSrcSize, HUF_DTable const* DTable, int flags)  \
     {                                                                       \
-        (void)bmi2;                                                         \
+        (void)flags;                                                        \
         return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable);             \
     }
 
@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
     return dtd;
 }
 
-#if ZSTD_ENABLE_ASM_X86_64_BMI2
-
-static size_t HUF_initDStream(BYTE const* ip) {
+static size_t HUF_initFastDStream(BYTE const* ip) {
     BYTE const lastByte = ip[7];
-    size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+    size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0;
     size_t const value = MEM_readLEST(ip) | 1;
     assert(bitsConsumed <= 8);
+    assert(sizeof(size_t) == 8);
     return value << bitsConsumed;
 }
+
+
+/*
+ * The input/output arguments to the Huffman fast decoding loop:
+ *
+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed.
+ * op [in/out] - The output pointers, must be updated to reflect what is written.
+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state.
+ * dt [in] - The decoding table.
+ * ilowest [in] - The beginning of the valid range of the input. Decoders may read
+ *                down to this pointer. It may be below iend[0].
+ * oend [in] - The end of the output stream. op[3] must not cross oend.
+ * iend [in] - The end of each input stream. ip[i] may cross iend[i],
+ *             as long as it is above ilowest, but that indicates corruption.
+ */
 typedef struct {
     BYTE const* ip[4];
     BYTE* op[4];
     U64 bits[4];
     void const* dt;
-    BYTE const* ilimit;
+    BYTE const* ilowest;
     BYTE* oend;
     BYTE const* iend[4];
-} HUF_DecompressAsmArgs;
+} HUF_DecompressFastArgs;
+
+typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*);
 
 /*
- * Initializes args for the asm decoding loop.
- * @returns 0 on success
- *          1 if the fallback implementation should be used.
+ * Initializes args for the fast decoding loop.
+ * @returns 1 on success
+ *          0 if the fallback implementation should be used.
  *          Or an error code on failure.
  */
-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
+static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable)
 {
     void const* dt = DTable + 1;
     U32 const dtLog = HUF_getDTableDesc(DTable).tableLog;
 
-    const BYTE* const ilimit = (const BYTE*)src + 6 + 8;
+    const BYTE* const istart = (const BYTE*)src;
 
-    BYTE* const oend = (BYTE*)dst + dstSize;
+    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
 
-    /* The following condition is false on x32 platform,
-     * but HUF_asm is not compatible with this ABI */
-    if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1;
+    /* The fast decoding loop assumes 64-bit little-endian.
+     * This condition is false on x32.
+     */
+    if (!MEM_isLittleEndian() || MEM_32bits())
+        return 0;
+
+    /* Avoid nullptr addition */
+    if (dstSize == 0)
+        return 0;
+    assert(dst != NULL);
 
     /* strict minimum : jump table + 1 byte per stream */
     if (srcSize < 10)
@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
      * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder.
      */
     if (dtLog != HUF_DECODER_FAST_TABLELOG)
-        return 1;
+        return 0;
 
     /* Read the jump table. */
     {
-        const BYTE* const istart = (const BYTE*)src;
         size_t const length1 = MEM_readLE16(istart);
         size_t const length2 = MEM_readLE16(istart+2);
         size_t const length3 = MEM_readLE16(istart+4);
@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
         args->iend[2] = args->iend[1] + length2;
         args->iend[3] = args->iend[2] + length3;
 
-        /* HUF_initDStream() requires this, and this small of an input
+        /* HUF_initFastDStream() requires this, and this small of an input
          * won't benefit from the ASM loop anyways.
-         * length1 must be >= 16 so that ip[0] >= ilimit before the loop
-         * starts.
          */
-        if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8)
-            return 1;
+        if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8)
+            return 0;
         if (length4 > srcSize) return ERROR(corruption_detected);   /* overflow */
     }
     /* ip[] contains the position that is currently loaded into bits[]. */
@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
 
     /* No point to call the ASM loop for tiny outputs. */
     if (args->op[3] >= oend)
-        return 1;
+        return 0;
 
     /* bits[] is the bit container.
         * It is read from the MSB down to the LSB.
@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst,
         * set, so that CountTrailingZeros(bits[]) can be used
         * to count how many bits we've consumed.
         */
-    args->bits[0] = HUF_initDStream(args->ip[0]);
-    args->bits[1] = HUF_initDStream(args->ip[1]);
-    args->bits[2] = HUF_initDStream(args->ip[2]);
-    args->bits[3] = HUF_initDStream(args->ip[3]);
-
-    /* If ip[] >= ilimit, it is guaranteed to be safe to
-        * reload bits[]. It may be beyond its section, but is
-        * guaranteed to be valid (>= istart).
-        */
-    args->ilimit = ilimit;
+    args->bits[0] = HUF_initFastDStream(args->ip[0]);
+    args->bits[1] = HUF_initFastDStream(args->ip[1]);
+    args->bits[2] = HUF_initFastDStream(args->ip[2]);
+    args->bits[3] = HUF_initFastDStream(args->ip[3]);
+
+    /* The decoders must be sure to never read beyond ilowest.
+     * This is lower than iend[0], but allowing decoders to read
+     * down to ilowest can allow an extra iteration or two in the
+     * fast loop.
+     */
+    args->ilowest = istart;
 
     args->oend = oend;
     args->dt = dt;
 
-    return 0;
+    return 1;
 }
 
-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd)
+static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd)
 {
     /* Validate that we haven't overwritten. */
     if (args->op[stream] > segmentEnd)
@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs
         return ERROR(corruption_detected);
 
     /* Construct the BIT_DStream_t. */
-    bit->bitContainer = MEM_readLE64(args->ip[stream]);
-    bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]);
-    bit->start = (const char*)args->iend[0];
+    assert(sizeof(size_t) == 8);
+    bit->bitContainer = MEM_readLEST(args->ip[stream]);
+    bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]);
+    bit->start = (const char*)args->ilowest;
     bit->limitPtr = bit->start + sizeof(size_t);
     bit->ptr = (const char*)args->ip[stream];
 
     return 0;
 }
-#endif
+
+/* Calls X(N) for each stream 0, 1, 2, 3. */
+#define HUF_4X_FOR_EACH_STREAM(X) \
+    do {                          \
+        X(0);                     \
+        X(1);                     \
+        X(2);                     \
+        X(3);                     \
+    } while (0)
+
+/* Calls X(N, var) for each stream 0, 1, 2, 3. */
+#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \
+    do {                                        \
+        X(0, (var));                            \
+        X(1, (var));                            \
+        X(2, (var));                            \
+        X(3, (var));                            \
+    } while (0)
 
 
 #ifndef HUF_FORCE_DECOMPRESS_X2
@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1;   /* single-symbol decodi
 static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) {
     U64 D4;
     if (MEM_isLittleEndian()) {
-        D4 = (symbol << 8) + nbBits;
+        D4 = (U64)((symbol << 8) + nbBits);
     } else {
-        D4 = symbol + (nbBits << 8);
+        D4 = (U64)(symbol + (nbBits << 8));
     }
+    assert(D4 < (1U << 16));
     D4 *= 0x0001000100010001ULL;
     return D4;
 }
@@ -329,13 +379,7 @@ typedef struct {
         BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];
 } HUF_ReadDTableX1_Workspace;
 
-
-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
-{
-    return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags)
 {
     U32 tableLog = 0;
     U32 nbSymbols = 0;
@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
     DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
     /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */   /* is not necessary, even though some analyzer complain ... */
 
-    iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2);
+    iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags);
     if (HUF_isError(iSize)) return iSize;
 
 
@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
      * rankStart[0] is not filled because there are no entries in the table for
      * weight 0.
      */
-    {
-        int n;
-        int nextRankStart = 0;
+    {   int n;
+        U32 nextRankStart = 0;
         int const unroll = 4;
         int const nLimit = (int)nbSymbols - unroll + 1;
         for (n=0; n<(int)tableLog+1; n++) {
@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr
      * We can switch based on the length to a different inner loop which is
      * optimized for that particular case.
      */
-    {
-        U32 w;
-        int symbol=wksp->rankVal[0];
-        int rankStart=0;
+    {   U32 w;
+        int symbol = wksp->rankVal[0];
+        int rankStart = 0;
         for (w=1; w<tableLog+1; ++w) {
             int const symbolCount = wksp->rankVal[w];
             int const length = (1 << w) >> 1;
@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog
 }
 
 #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
-    *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
+    do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0)
 
-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)  \
-    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
-        HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
+#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr)      \
+    do {                                            \
+        if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+            HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
+    } while (0)
 
-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
-    if (MEM_64bits()) \
-        HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
+#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr)      \
+    do {                                            \
+        if (MEM_64bits())                           \
+            HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \
+    } while (0)
 
 HINT_INLINE size_t
 HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons
     while (p < pEnd)
         HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
 
-    return pEnd-pStart;
+    return (size_t)(pEnd-pStart);
 }
 
 FORCE_INLINE_TEMPLATE size_t
@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_body(
     const HUF_DTable* DTable)
 {
     BYTE* op = (BYTE*)dst;
-    BYTE* const oend = op + dstSize;
+    BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize);
     const void* dtPtr = DTable + 1;
     const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
     BIT_DStream_t bitD;
@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_body(
     return dstSize;
 }
 
+/* HUF_decompress4X1_usingDTable_internal_body():
+ * Conditions :
+ * @dstSize >= 6
+ */
 FORCE_INLINE_TEMPLATE size_t
 HUF_decompress4X1_usingDTable_internal_body(
           void* dst,  size_t dstSize,
@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_body(
 {
     /* Check */
     if (cSrcSize < 10) return ERROR(corruption_detected);  /* strict minimum : jump table + 1 byte per stream */
+    if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
 
     {   const BYTE* const istart = (const BYTE*) cSrc;
         BYTE* const ostart = (BYTE*) dst;
@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_body(
 
         if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
         if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
+        assert(dstSize >= 6); /* validated above */
         CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
         CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
         CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
 }
 #endif
 
-#if HUF_NEED_DEFAULT_FUNCTION
 static
 size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
                     size_t cSrcSize, HUF_DTable const* DTable) {
     return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
 }
-#endif
 
 #if ZSTD_ENABLE_ASM_X86_64_BMI2
 
-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
+HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
+
+#endif
+
+static HUF_FAST_BMI2_ATTRS
+void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
+{
+    U64 bits[4];
+    BYTE const* ip[4];
+    BYTE* op[4];
+    U16 const* const dtable = (U16 const*)args->dt;
+    BYTE* const oend = args->oend;
+    BYTE const* const ilowest = args->ilowest;
+
+    /* Copy the arguments to local variables */
+    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
+    ZSTD_memcpy(&op, &args->op, sizeof(op));
+
+    assert(MEM_isLittleEndian());
+    assert(!MEM_32bits());
+
+    for (;;) {
+        BYTE* olimit;
+        int stream;
+
+        /* Assert loop preconditions */
+#ifndef NDEBUG
+        for (stream = 0; stream < 4; ++stream) {
+            assert(op[stream] <= (stream == 3 ? oend : op[stream + 1]));
+            assert(ip[stream] >= ilowest);
+        }
+#endif
+        /* Compute olimit */
+        {
+            /* Each iteration produces 5 output symbols per stream */
+            size_t const oiters = (size_t)(oend - op[3]) / 5;
+            /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes
+             * per stream.
+             */
+            size_t const iiters = (size_t)(ip[0] - ilowest) / 7;
+            /* We can safely run iters iterations before running bounds checks */
+            size_t const iters = MIN(oiters, iiters);
+            size_t const symbols = iters * 5;
+
+            /* We can simply check that op[3] < olimit, instead of checking all
+             * of our bounds, since we can't hit the other bounds until we've run
+             * iters iterations, which only happens when op[3] == olimit.
+             */
+            olimit = op[3] + symbols;
+
+            /* Exit fast decoding loop once we reach the end. */
+            if (op[3] == olimit)
+                break;
+
+            /* Exit the decoding loop if any input pointer has crossed the
+             * previous one. This indicates corruption, and a precondition
+             * to our loop is that ip[i] >= ip[0].
+             */
+            for (stream = 1; stream < 4; ++stream) {
+                if (ip[stream] < ip[stream - 1])
+                    goto _out;
+            }
+        }
+
+#ifndef NDEBUG
+        for (stream = 1; stream < 4; ++stream) {
+            assert(ip[stream] >= ip[stream - 1]);
+        }
+#endif
+
+#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol)                 \
+    do {                                                        \
+        int const index = (int)(bits[(_stream)] >> 53);         \
+        int const entry = (int)dtable[index];                   \
+        bits[(_stream)] <<= (entry & 0x3F);                     \
+        op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
+    } while (0)
+
+#define HUF_4X1_RELOAD_STREAM(_stream)                              \
+    do {                                                            \
+        int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
+        int const nbBits = ctz & 7;                                 \
+        int const nbBytes = ctz >> 3;                               \
+        op[(_stream)] += 5;                                         \
+        ip[(_stream)] -= nbBytes;                                   \
+        bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;            \
+        bits[(_stream)] <<= nbBits;                                 \
+    } while (0)
+
+        /* Manually unroll the loop because compilers don't consistently
+         * unroll the inner loops, which destroys performance.
+         */
+        do {
+            /* Decode 5 symbols in each of the 4 streams */
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
+
+            /* Reload each of the 4 the bitstreams */
+            HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
+        } while (op[3] < olimit);
+
+#undef HUF_4X1_DECODE_SYMBOL
+#undef HUF_4X1_RELOAD_STREAM
+    }
 
-static HUF_ASM_X86_64_BMI2_ATTRS
+_out:
+
+    /* Save the final values of each of the state variables back to args. */
+    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
+    ZSTD_memcpy(&args->op, &op, sizeof(op));
+}
+
+/*
+ * @returns @p dstSize on success (>= 6)
+ *          0 if the fallback implementation should be used
+ *          An error if an error occurred
+ */
+static HUF_FAST_BMI2_ATTRS
 size_t
-HUF_decompress4X1_usingDTable_internal_bmi2_asm(
+HUF_decompress4X1_usingDTable_internal_fast(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
+    const HUF_DTable* DTable,
+    HUF_DecompressFastLoopFn loopFn)
 {
     void const* dt = DTable + 1;
-    const BYTE* const iend = (const BYTE*)cSrc + 6;
-    BYTE* const oend = (BYTE*)dst + dstSize;
-    HUF_DecompressAsmArgs args;
-    {
-        size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
-        FORWARD_IF_ERROR(ret, "Failed to init asm args");
-        if (ret != 0)
-            return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+    BYTE const* const ilowest = (BYTE const*)cSrc;
+    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
+    HUF_DecompressFastArgs args;
+    {   size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+        FORWARD_IF_ERROR(ret, "Failed to init fast loop args");
+        if (ret == 0)
+            return 0;
     }
 
-    assert(args.ip[0] >= args.ilimit);
-    HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args);
+    assert(args.ip[0] >= args.ilowest);
+    loopFn(&args);
 
-    /* Our loop guarantees that ip[] >= ilimit and that we haven't
+    /* Our loop guarantees that ip[] >= ilowest and that we haven't
     * overwritten any op[].
     */
-    assert(args.ip[0] >= iend);
-    assert(args.ip[1] >= iend);
-    assert(args.ip[2] >= iend);
-    assert(args.ip[3] >= iend);
+    assert(args.ip[0] >= ilowest);
+    assert(args.ip[0] >= ilowest);
+    assert(args.ip[1] >= ilowest);
+    assert(args.ip[2] >= ilowest);
+    assert(args.ip[3] >= ilowest);
     assert(args.op[3] <= oend);
-    (void)iend;
+
+    assert(ilowest == args.ilowest);
+    assert(ilowest + 6 == args.iend[0]);
+    (void)ilowest;
 
     /* finish bit streams one by one. */
-    {
-        size_t const segmentSize = (dstSize+3) / 4;
+    {   size_t const segmentSize = (dstSize+3) / 4;
         BYTE* segmentEnd = (BYTE*)dst;
         int i;
         for (i = 0; i < 4; ++i) {
@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm(
     }
 
     /* decoded size */
+    assert(dstSize != 0);
     return dstSize;
 }
-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
-
-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
-                                               const void *cSrc,
-                                               size_t cSrcSize,
-                                               const HUF_DTable *DTable);
 
 HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
 
 static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
-                    size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
+                    size_t cSrcSize, HUF_DTable const* DTable, int flags)
 {
+    HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default;
+    HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop;
+
 #if DYNAMIC_BMI2
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
+        fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2;
 # if ZSTD_ENABLE_ASM_X86_64_BMI2
-        return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-# else
-        return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+        if (!(flags & HUF_flags_disableAsm)) {
+            loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+        }
 # endif
+    } else {
+        return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
     }
-#else
-    (void)bmi2;
 #endif
 
 #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
-    return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-#else
-    return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
+    if (!(flags & HUF_flags_disableAsm)) {
+        loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop;
+    }
 #endif
-}
-
-
-size_t HUF_decompress1X1_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 0) return ERROR(GENERIC);
-    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-}
 
-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
-{
-    const BYTE* ip = (const BYTE*) cSrc;
-
-    size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
-    if (HUF_isError(hSize)) return hSize;
-    if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
-    ip += hSize; cSrcSize -= hSize;
-
-    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
-}
-
-
-size_t HUF_decompress4X1_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 0) return ERROR(GENERIC);
-    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+    if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
+        size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
+        if (ret != 0)
+            return ret;
+    }
+    return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
 }
 
-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize, int bmi2)
+                                   void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+    size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
-}
-
-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
-{
-    return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
+    return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
 }
 
-
 #endif /* HUF_FORCE_DECOMPRESS_X2 */
 
 
@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32
 
 static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
                            const sortedSymbol_t* sortedList,
-                           const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight,
+                           const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight,
                            const U32 nbBitsBaseline)
 {
     U32* const rankVal = rankValOrigin[0];
@@ -1040,14 +1175,7 @@ typedef struct {
 
 size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
                        const void* src, size_t srcSize,
-                             void* workSpace, size_t wkspSize)
-{
-    return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
-                       const void* src, size_t srcSize,
-                             void* workSpace, size_t wkspSize, int bmi2)
+                             void* workSpace, size_t wkspSize, int flags)
 {
     U32 tableLog, maxW, nbSymbols;
     DTableDesc dtd = HUF_getDTableDesc(DTable);
@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable,
     if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
     /* ZSTD_memset(weightList, 0, sizeof(weightList)); */  /* is not necessary, even though some analyzer complain ... */
 
-    iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2);
+    iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags);
     if (HUF_isError(iSize)) return iSize;
 
     /* check result */
@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c
 }
 
 #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
-    ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+    do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0)
 
-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
-    if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
-        ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr)                     \
+    do {                                                           \
+        if (MEM_64bits() || (HUF_TABLELOG_MAX<=12))                \
+            ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
+    } while (0)
 
-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
-    if (MEM_64bits()) \
-        ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr)                     \
+    do {                                                           \
+        if (MEM_64bits())                                          \
+            ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \
+    } while (0)
 
 HINT_INLINE size_t
 HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
@@ -1227,7 +1359,7 @@ HUF_decompress1X2_usingDTable_internal_body(
 
     /* decode */
     {   BYTE* const ostart = (BYTE*) dst;
-        BYTE* const oend = ostart + dstSize;
+        BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize);
         const void* const dtPtr = DTable+1;   /* force compiler to not use strict-aliasing */
         const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
         DTableDesc const dtd = HUF_getDTableDesc(DTable);
@@ -1240,6 +1372,11 @@ HUF_decompress1X2_usingDTable_internal_body(
     /* decoded size */
     return dstSize;
 }
+
+/* HUF_decompress4X2_usingDTable_internal_body():
+ * Conditions:
+ * @dstSize >= 6
+ */
 FORCE_INLINE_TEMPLATE size_t
 HUF_decompress4X2_usingDTable_internal_body(
           void* dst,  size_t dstSize,
@@ -1247,6 +1384,7 @@ HUF_decompress4X2_usingDTable_internal_body(
     const HUF_DTable* DTable)
 {
     if (cSrcSize < 10) return ERROR(corruption_detected);   /* strict minimum : jump table + 1 byte per stream */
+    if (dstSize < 6) return ERROR(corruption_detected);         /* stream 4-split doesn't work */
 
     {   const BYTE* const istart = (const BYTE*) cSrc;
         BYTE* const ostart = (BYTE*) dst;
@@ -1280,8 +1418,9 @@ HUF_decompress4X2_usingDTable_internal_body(
         DTableDesc const dtd = HUF_getDTableDesc(DTable);
         U32 const dtLog = dtd.tableLog;
 
-        if (length4 > cSrcSize) return ERROR(corruption_detected);   /* overflow */
-        if (opStart4 > oend) return ERROR(corruption_detected);      /* overflow */
+        if (length4 > cSrcSize) return ERROR(corruption_detected);  /* overflow */
+        if (opStart4 > oend) return ERROR(corruption_detected);     /* overflow */
+        assert(dstSize >= 6 /* validated above */);
         CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
         CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
         CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
@@ -1366,44 +1505,191 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo
 }
 #endif
 
-#if HUF_NEED_DEFAULT_FUNCTION
 static
 size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc,
                     size_t cSrcSize, HUF_DTable const* DTable) {
     return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable);
 }
-#endif
 
 #if ZSTD_ENABLE_ASM_X86_64_BMI2
 
-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN;
+HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN;
+
+#endif
+
+static HUF_FAST_BMI2_ATTRS
+void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args)
+{
+    U64 bits[4];
+    BYTE const* ip[4];
+    BYTE* op[4];
+    BYTE* oend[4];
+    HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt;
+    BYTE const* const ilowest = args->ilowest;
+
+    /* Copy the arguments to local registers. */
+    ZSTD_memcpy(&bits, &args->bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip));
+    ZSTD_memcpy(&op, &args->op, sizeof(op));
+
+    oend[0] = op[1];
+    oend[1] = op[2];
+    oend[2] = op[3];
+    oend[3] = args->oend;
+
+    assert(MEM_isLittleEndian());
+    assert(!MEM_32bits());
+
+    for (;;) {
+        BYTE* olimit;
+        int stream;
+
+        /* Assert loop preconditions */
+#ifndef NDEBUG
+        for (stream = 0; stream < 4; ++stream) {
+            assert(op[stream] <= oend[stream]);
+            assert(ip[stream] >= ilowest);
+        }
+#endif
+        /* Compute olimit */
+        {
+            /* Each loop does 5 table lookups for each of the 4 streams.
+             * Each table lookup consumes up to 11 bits of input, and produces
+             * up to 2 bytes of output.
+             */
+            /* We can consume up to 7 bytes of input per iteration per stream.
+             * We also know that each input pointer is >= ip[0]. So we can run
+             * iters loops before running out of input.
+             */
+            size_t iters = (size_t)(ip[0] - ilowest) / 7;
+            /* Each iteration can produce up to 10 bytes of output per stream.
+             * Each output stream my advance at different rates. So take the
+             * minimum number of safe iterations among all the output streams.
+             */
+            for (stream = 0; stream < 4; ++stream) {
+                size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10;
+                iters = MIN(iters, oiters);
+            }
+
+            /* Each iteration produces at least 5 output symbols. So until
+             * op[3] crosses olimit, we know we haven't executed iters
+             * iterations yet. This saves us maintaining an iters counter,
+             * at the expense of computing the remaining # of iterations
+             * more frequently.
+             */
+            olimit = op[3] + (iters * 5);
+
+            /* Exit the fast decoding loop once we reach the end. */
+            if (op[3] == olimit)
+                break;
+
+            /* Exit the decoding loop if any input pointer has crossed the
+             * previous one. This indicates corruption, and a precondition
+             * to our loop is that ip[i] >= ip[0].
+             */
+            for (stream = 1; stream < 4; ++stream) {
+                if (ip[stream] < ip[stream - 1])
+                    goto _out;
+            }
+        }
+
+#ifndef NDEBUG
+        for (stream = 1; stream < 4; ++stream) {
+            assert(ip[stream] >= ip[stream - 1]);
+        }
+#endif
 
-static HUF_ASM_X86_64_BMI2_ATTRS size_t
-HUF_decompress4X2_usingDTable_internal_bmi2_asm(
+#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3)                      \
+    do {                                                              \
+        if ((_decode3) || (_stream) != 3) {                           \
+            int const index = (int)(bits[(_stream)] >> 53);           \
+            HUF_DEltX2 const entry = dtable[index];                   \
+            MEM_write16(op[(_stream)], entry.sequence); \
+            bits[(_stream)] <<= (entry.nbBits) & 0x3F;                \
+            op[(_stream)] += (entry.length);                          \
+        }                                                             \
+    } while (0)
+
+#define HUF_4X2_RELOAD_STREAM(_stream)                                  \
+    do {                                                                \
+        HUF_4X2_DECODE_SYMBOL(3, 1);                                    \
+        {                                                               \
+            int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
+            int const nbBits = ctz & 7;                                 \
+            int const nbBytes = ctz >> 3;                               \
+            ip[(_stream)] -= nbBytes;                                   \
+            bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1;            \
+            bits[(_stream)] <<= nbBits;                                 \
+        }                                                               \
+    } while (0)
+
+        /* Manually unroll the loop because compilers don't consistently
+         * unroll the inner loops, which destroys performance.
+         */
+        do {
+            /* Decode 5 symbols from each of the first 3 streams.
+             * The final stream will be decoded during the reload phase
+             * to reduce register pressure.
+             */
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+            HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
+
+            /* Decode one symbol from the final stream */
+            HUF_4X2_DECODE_SYMBOL(3, 1);
+
+            /* Decode 4 symbols from the final stream & reload bitstreams.
+             * The final stream is reloaded last, meaning that all 5 symbols
+             * are decoded from the final stream before it is reloaded.
+             */
+            HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
+        } while (op[3] < olimit);
+    }
+
+#undef HUF_4X2_DECODE_SYMBOL
+#undef HUF_4X2_RELOAD_STREAM
+
+_out:
+
+    /* Save the final values of each of the state variables back to args. */
+    ZSTD_memcpy(&args->bits, &bits, sizeof(bits));
+    ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip));
+    ZSTD_memcpy(&args->op, &op, sizeof(op));
+}
+
+
+static HUF_FAST_BMI2_ATTRS size_t
+HUF_decompress4X2_usingDTable_internal_fast(
           void* dst,  size_t dstSize,
     const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable) {
+    const HUF_DTable* DTable,
+    HUF_DecompressFastLoopFn loopFn) {
     void const* dt = DTable + 1;
-    const BYTE* const iend = (const BYTE*)cSrc + 6;
-    BYTE* const oend = (BYTE*)dst + dstSize;
-    HUF_DecompressAsmArgs args;
+    const BYTE* const ilowest = (const BYTE*)cSrc;
+    BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize);
+    HUF_DecompressFastArgs args;
     {
-        size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
+        size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable);
         FORWARD_IF_ERROR(ret, "Failed to init asm args");
-        if (ret != 0)
-            return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+        if (ret == 0)
+            return 0;
     }
 
-    assert(args.ip[0] >= args.ilimit);
-    HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args);
+    assert(args.ip[0] >= args.ilowest);
+    loopFn(&args);
 
     /* note : op4 already verified within main loop */
-    assert(args.ip[0] >= iend);
-    assert(args.ip[1] >= iend);
-    assert(args.ip[2] >= iend);
-    assert(args.ip[3] >= iend);
+    assert(args.ip[0] >= ilowest);
+    assert(args.ip[1] >= ilowest);
+    assert(args.ip[2] >= ilowest);
+    assert(args.ip[3] >= ilowest);
     assert(args.op[3] <= oend);
-    (void)iend;
+
+    assert(ilowest == args.ilowest);
+    assert(ilowest + 6 == args.iend[0]);
+    (void)ilowest;
 
     /* finish bitStreams one by one */
     {
@@ -1426,91 +1712,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm(
     /* decoded size */
     return dstSize;
 }
-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */
 
 static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc,
-                    size_t cSrcSize, HUF_DTable const* DTable, int bmi2)
+                    size_t cSrcSize, HUF_DTable const* DTable, int flags)
 {
+    HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default;
+    HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop;
+
 #if DYNAMIC_BMI2
-    if (bmi2) {
+    if (flags & HUF_flags_bmi2) {
+        fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2;
 # if ZSTD_ENABLE_ASM_X86_64_BMI2
-        return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-# else
-        return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable);
+        if (!(flags & HUF_flags_disableAsm)) {
+            loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
+        }
 # endif
+    } else {
+        return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
     }
-#else
-    (void)bmi2;
 #endif
 
 #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)
-    return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable);
-#else
-    return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable);
+    if (!(flags & HUF_flags_disableAsm)) {
+        loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop;
+    }
 #endif
+
+    if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) {
+        size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn);
+        if (ret != 0)
+            return ret;
+    }
+    return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable);
 }
 
 HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
 
-size_t HUF_decompress1X2_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 1) return ERROR(GENERIC);
-    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-}
-
 size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
+                                   void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
     size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
-                                               workSpace, wkspSize);
+                                               workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
+    return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags);
 }
 
-
-size_t HUF_decompress4X2_usingDTable(
-          void* dst,  size_t dstSize,
-    const void* cSrc, size_t cSrcSize,
-    const HUF_DTable* DTable)
-{
-    DTableDesc dtd = HUF_getDTableDesc(DTable);
-    if (dtd.tableType != 1) return ERROR(GENERIC);
-    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-}
-
-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                    const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize, int bmi2)
+                                   void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
     size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
-                                         workSpace, wkspSize);
+                                         workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
 }
 
-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
-                                   const void* cSrc, size_t cSrcSize,
-                                   void* workSpace, size_t wkspSize)
-{
-    return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
-}
-
-
 #endif /* HUF_FORCE_DECOMPRESS_X1 */
 
 
@@ -1518,44 +1785,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
 /* Universal decompression selectors */
 /* ***********************************/
 
-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
-                                    const void* cSrc, size_t cSrcSize,
-                                    const HUF_DTable* DTable)
-{
-    DTableDesc const dtd = HUF_getDTableDesc(DTable);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-    (void)dtd;
-    assert(dtd.tableType == 0);
-    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-    (void)dtd;
-    assert(dtd.tableType == 1);
-    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#else
-    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
-                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#endif
-}
-
-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
-                                    const void* cSrc, size_t cSrcSize,
-                                    const HUF_DTable* DTable)
-{
-    DTableDesc const dtd = HUF_getDTableDesc(DTable);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-    (void)dtd;
-    assert(dtd.tableType == 0);
-    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-    (void)dtd;
-    assert(dtd.tableType == 1);
-    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#else
-    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
-                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
-#endif
-}
-
 
 #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
 typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
@@ -1610,36 +1839,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
 #endif
 }
 
-
-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
-                                     size_t dstSize, const void* cSrc,
-                                     size_t cSrcSize, void* workSpace,
-                                     size_t wkspSize)
-{
-    /* validation checks */
-    if (dstSize == 0) return ERROR(dstSize_tooSmall);
-    if (cSrcSize == 0) return ERROR(corruption_detected);
-
-    {   U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
-#if defined(HUF_FORCE_DECOMPRESS_X1)
-        (void)algoNb;
-        assert(algoNb == 0);
-        return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#elif defined(HUF_FORCE_DECOMPRESS_X2)
-        (void)algoNb;
-        assert(algoNb == 1);
-        return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#else
-        return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                            cSrcSize, workSpace, wkspSize):
-                        HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
-#endif
-    }
-}
-
 size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
                                   const void* cSrc, size_t cSrcSize,
-                                  void* workSpace, size_t wkspSize)
+                                  void* workSpace, size_t wkspSize, int flags)
 {
     /* validation checks */
     if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1652,71 +1854,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
         (void)algoNb;
         assert(algoNb == 0);
         return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize);
+                                cSrcSize, workSpace, wkspSize, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
         (void)algoNb;
         assert(algoNb == 1);
         return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize);
+                                cSrcSize, workSpace, wkspSize, flags);
 #else
         return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize):
+                                cSrcSize, workSpace, wkspSize, flags):
                         HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
-                                cSrcSize, workSpace, wkspSize);
+                                cSrcSize, workSpace, wkspSize, flags);
 #endif
     }
 }
 
 
-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
 {
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
 #if defined(HUF_FORCE_DECOMPRESS_X1)
     (void)dtd;
     assert(dtd.tableType == 0);
-    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
     (void)dtd;
     assert(dtd.tableType == 1);
-    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #else
-    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
-                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
+                           HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #endif
 }
 
 #ifndef HUF_FORCE_DECOMPRESS_X2
-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
 {
     const BYTE* ip = (const BYTE*) cSrc;
 
-    size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+    size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags);
     if (HUF_isError(hSize)) return hSize;
     if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
     ip += hSize; cSrcSize -= hSize;
 
-    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+    return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags);
 }
 #endif
 
-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags)
 {
     DTableDesc const dtd = HUF_getDTableDesc(DTable);
 #if defined(HUF_FORCE_DECOMPRESS_X1)
     (void)dtd;
     assert(dtd.tableType == 0);
-    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
     (void)dtd;
     assert(dtd.tableType == 1);
-    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #else
-    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
-                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+    return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) :
+                           HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags);
 #endif
 }
 
-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags)
 {
     /* validation checks */
     if (dstSize == 0) return ERROR(dstSize_tooSmall);
@@ -1726,15 +1928,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds
 #if defined(HUF_FORCE_DECOMPRESS_X1)
         (void)algoNb;
         assert(algoNb == 0);
-        return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+        return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
 #elif defined(HUF_FORCE_DECOMPRESS_X2)
         (void)algoNb;
         assert(algoNb == 1);
-        return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+        return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
 #else
-        return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
-                        HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+        return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) :
+                        HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags);
 #endif
     }
 }
-
diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c
index dbbc7919de53..30ef65e1ab5c 100644
--- a/lib/zstd/decompress/zstd_ddict.c
+++ b/lib/zstd/decompress/zstd_ddict.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,12 +15,12 @@
 /*-*******************************************************
 *  Dependencies
 *********************************************************/
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customFree */
 #include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
 #include "../common/cpu.h"         /* bmi2 */
 #include "../common/mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "zstd_decompress_internal.h"
 #include "zstd_ddict.h"
@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
         ZSTD_memcpy(internalBuffer, dict, dictSize);
     }
     ddict->dictSize = dictSize;
-    ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+    ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001);  /* cover both little and big endian */
 
     /* parse dictionary content */
     FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , "");
@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
 unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
 {
     if (ddict==NULL) return 0;
-    return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
+    return ddict->dictID;
 }
diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h
index 8c1a79d666f8..de459a0dacd1 100644
--- a/lib/zstd/decompress/zstd_ddict.h
+++ b/lib/zstd/decompress/zstd_ddict.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c
index 6b3177c94711..c9cbc45f6ed9 100644
--- a/lib/zstd/decompress/zstd_decompress.c
+++ b/lib/zstd/decompress/zstd_decompress.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -53,13 +54,15 @@
 *  Dependencies
 *********************************************************/
 #include "../common/zstd_deps.h"   /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
+#include "../common/allocations.h"  /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */
+#include "../common/error_private.h"
+#include "../common/zstd_internal.h"  /* blockProperties_t */
 #include "../common/mem.h"         /* low level memory routines */
+#include "../common/bits.h"  /* ZSTD_highbit32 */
 #define FSE_STATIC_LINKING_ONLY
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include <linux/xxhash.h> /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */
-#include "../common/zstd_internal.h"  /* blockProperties_t */
 #include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
 #include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
 #include "zstd_decompress_block.h"   /* ZSTD_decompressBlock_internal */
@@ -72,11 +75,11 @@
  *************************************/
 
 #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4
-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3   /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float.
-                                                     * Currently, that means a 0.75 load factor.
-                                                     * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded
-                                                     * the load factor of the ddict hash set.
-                                                     */
+#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3  /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float.
+                                                    * Currently, that means a 0.75 load factor.
+                                                    * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded
+                                                    * the load factor of the ddict hash set.
+                                                    */
 
 #define DDICT_HASHSET_TABLE_BASE_SIZE 64
 #define DDICT_HASHSET_RESIZE_FACTOR 2
@@ -237,6 +240,8 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx)
     dctx->outBufferMode = ZSTD_bm_buffered;
     dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum;
     dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict;
+    dctx->disableHufAsm = 0;
+    dctx->maxBlockSizeParam = 0;
 }
 
 static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
@@ -253,6 +258,7 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
     dctx->streamStage = zdss_init;
     dctx->noForwardProgress = 0;
     dctx->oversizedDuration = 0;
+    dctx->isFrameDecompression = 1;
 #if DYNAMIC_BMI2
     dctx->bmi2 = ZSTD_cpuSupportsBmi2();
 #endif
@@ -421,16 +427,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
  *  note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
  * @return : 0, `zfhPtr` is correctly filled,
  *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
- *           or an error code, which can be tested using ZSTD_isError() */
+**           or an error code, which can be tested using ZSTD_isError() */
 size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format)
 {
     const BYTE* ip = (const BYTE*)src;
     size_t const minInputSize = ZSTD_startingInputLength(format);
 
-    ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */
-    if (srcSize < minInputSize) return minInputSize;
-    RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter");
+    DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize);
+
+    if (srcSize > 0) {
+        /* note : technically could be considered an assert(), since it's an invalid entry */
+        RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0");
+    }
+    if (srcSize < minInputSize) {
+        if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) {
+            /* when receiving less than @minInputSize bytes,
+             * control these bytes at least correspond to a supported magic number
+             * in order to error out early if they don't.
+            **/
+            size_t const toCopy = MIN(4, srcSize);
+            unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER);
+            assert(src != NULL);
+            ZSTD_memcpy(hbuf, src, toCopy);
+            if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) {
+                /* not a zstd frame : let's check if it's a skippable frame */
+                MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START);
+                ZSTD_memcpy(hbuf, src, toCopy);
+                if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) {
+                    RETURN_ERROR(prefix_unknown,
+                                "first bytes don't correspond to any supported magic number");
+        }   }   }
+        return minInputSize;
+    }
 
+    ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */
     if ( (format != ZSTD_f_zstd1_magicless)
       && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
         if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
@@ -540,61 +570,62 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize)
     sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE);
     RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32,
                     frameParameter_unsupported, "");
-    {
-        size_t const skippableSize = skippableHeaderSize + sizeU32;
+    {   size_t const skippableSize = skippableHeaderSize + sizeU32;
         RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, "");
         return skippableSize;
     }
 }
 
 /*! ZSTD_readSkippableFrame() :
- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer.
+ * Retrieves content of a skippable frame, and writes it to dst buffer.
  *
  * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written,
  * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START.  This can be NULL if the caller is not interested
  * in the magicVariant.
  *
- * Returns an error if destination buffer is not large enough, or if the frame is not skippable.
+ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame.
  *
  * @return : number of bytes written or a ZSTD error.
  */
-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant,
-                                            const void* src, size_t srcSize)
+size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity,
+                               unsigned* magicVariant,  /* optional, can be NULL */
+                         const void* src, size_t srcSize)
 {
-    U32 const magicNumber = MEM_readLE32(src);
-    size_t skippableFrameSize = readSkippableFrameSize(src, srcSize);
-    size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE;
-
-    /* check input validity */
-    RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, "");
-    RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, "");
-    RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, "");
+    RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, "");
 
-    /* deliver payload */
-    if (skippableContentSize > 0  && dst != NULL)
-        ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize);
-    if (magicVariant != NULL)
-        *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START;
-    return skippableContentSize;
+    {   U32 const magicNumber = MEM_readLE32(src);
+        size_t skippableFrameSize = readSkippableFrameSize(src, srcSize);
+        size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE;
+
+        /* check input validity */
+        RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, "");
+        RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, "");
+        RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, "");
+
+        /* deliver payload */
+        if (skippableContentSize > 0  && dst != NULL)
+            ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize);
+        if (magicVariant != NULL)
+            *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START;
+        return skippableContentSize;
+    }
 }
 
 /* ZSTD_findDecompressedSize() :
- *  compatible with legacy mode
  *  `srcSize` must be the exact length of some number of ZSTD compressed and/or
  *      skippable frames
- *  @return : decompressed size of the frames contained */
+ *  note: compatible with legacy mode
+ * @return : decompressed size of the frames contained */
 unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
 {
-    unsigned long long totalDstSize = 0;
+    U64 totalDstSize = 0;
 
     while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) {
         U32 const magicNumber = MEM_readLE32(src);
 
         if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
             size_t const skippableSize = readSkippableFrameSize(src, srcSize);
-            if (ZSTD_isError(skippableSize)) {
-                return ZSTD_CONTENTSIZE_ERROR;
-            }
+            if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR;
             assert(skippableSize <= srcSize);
 
             src = (const BYTE *)src + skippableSize;
@@ -602,17 +633,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
             continue;
         }
 
-        {   unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
-            if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret;
+        {   unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize);
+            if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs;
 
-            /* check for overflow */
-            if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR;
-            totalDstSize += ret;
+            if (U64_MAX - totalDstSize < fcs)
+                return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */
+            totalDstSize += fcs;
         }
+        /* skip to next frame */
         {   size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize);
-            if (ZSTD_isError(frameSrcSize)) {
-                return ZSTD_CONTENTSIZE_ERROR;
-            }
+            if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR;
+            assert(frameSrcSize <= srcSize);
 
             src = (const BYTE *)src + frameSrcSize;
             srcSize -= frameSrcSize;
@@ -676,13 +707,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret)
     return frameSizeInfo;
 }
 
-static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize)
+static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format)
 {
     ZSTD_frameSizeInfo frameSizeInfo;
     ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo));
 
 
-    if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE)
+    if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE)
         && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
         frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize);
         assert(ZSTD_isError(frameSizeInfo.compressedSize) ||
@@ -696,7 +727,7 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize
         ZSTD_frameHeader zfh;
 
         /* Extract Frame Header */
-        {   size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize);
+        {   size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format);
             if (ZSTD_isError(ret))
                 return ZSTD_errorFrameSizeInfo(ret);
             if (ret > 0)
@@ -730,23 +761,26 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize
             ip += 4;
         }
 
+        frameSizeInfo.nbBlocks = nbBlocks;
         frameSizeInfo.compressedSize = (size_t)(ip - ipstart);
         frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN)
                                         ? zfh.frameContentSize
-                                        : nbBlocks * zfh.blockSizeMax;
+                                        : (unsigned long long)nbBlocks * zfh.blockSizeMax;
         return frameSizeInfo;
     }
 }
 
+static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) {
+    ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format);
+    return frameSizeInfo.compressedSize;
+}
+
 /* ZSTD_findFrameCompressedSize() :
- *  compatible with legacy mode
- *  `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame
- *  `srcSize` must be at least as large as the frame contained
- *  @return : the compressed size of the frame starting at `src` */
+ * See docs in zstd.h
+ * Note: compatible with legacy mode */
 size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
 {
-    ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
-    return frameSizeInfo.compressedSize;
+    return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1);
 }
 
 /* ZSTD_decompressBound() :
@@ -760,7 +794,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
     unsigned long long bound = 0;
     /* Iterate over each frame */
     while (srcSize > 0) {
-        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
+        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1);
         size_t const compressedSize = frameSizeInfo.compressedSize;
         unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
         if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
@@ -773,6 +807,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
     return bound;
 }
 
+size_t ZSTD_decompressionMargin(void const* src, size_t srcSize)
+{
+    size_t margin = 0;
+    unsigned maxBlockSize = 0;
+
+    /* Iterate over each frame */
+    while (srcSize > 0) {
+        ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1);
+        size_t const compressedSize = frameSizeInfo.compressedSize;
+        unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
+        ZSTD_frameHeader zfh;
+
+        FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), "");
+        if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
+            return ERROR(corruption_detected);
+
+        if (zfh.frameType == ZSTD_frame) {
+            /* Add the frame header to our margin */
+            margin += zfh.headerSize;
+            /* Add the checksum to our margin */
+            margin += zfh.checksumFlag ? 4 : 0;
+            /* Add 3 bytes per block */
+            margin += 3 * frameSizeInfo.nbBlocks;
+
+            /* Compute the max block size */
+            maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax);
+        } else {
+            assert(zfh.frameType == ZSTD_skippableFrame);
+            /* Add the entire skippable frame size to our margin. */
+            margin += compressedSize;
+        }
+
+        assert(srcSize >= compressedSize);
+        src = (const BYTE*)src + compressedSize;
+        srcSize -= compressedSize;
+    }
+
+    /* Add the max block size back to the margin. */
+    margin += maxBlockSize;
+
+    return margin;
+}
 
 /*-*************************************************************
  *   Frame decoding
@@ -856,6 +932,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
         ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize;
     }
 
+    /* Shrink the blockSizeMax if enabled */
+    if (dctx->maxBlockSizeParam != 0)
+        dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam);
+
     /* Loop on each block */
     while (1) {
         BYTE* oBlockEnd = oend;
@@ -888,7 +968,8 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
         switch(blockProperties.blockType)
         {
         case bt_compressed:
-            decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming);
+            assert(dctx->isFrameDecompression == 1);
+            decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming);
             break;
         case bt_raw :
             /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */
@@ -901,12 +982,14 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
         default:
             RETURN_ERROR(corruption_detected, "invalid block type");
         }
-
-        if (ZSTD_isError(decodedSize)) return decodedSize;
-        if (dctx->validateChecksum)
+        FORWARD_IF_ERROR(decodedSize, "Block decompression failure");
+        DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize);
+        if (dctx->validateChecksum) {
             xxh64_update(&dctx->xxhState, op, decodedSize);
-        if (decodedSize != 0)
+        }
+        if (decodedSize) /* support dst = NULL,0 */ {
             op += decodedSize;
+        }
         assert(ip != NULL);
         ip += cBlockSize;
         remainingSrcSize -= cBlockSize;
@@ -930,12 +1013,15 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
     }
     ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0);
     /* Allow caller to get size read */
+    DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr);
     *srcPtr = ip;
     *srcSizePtr = remainingSrcSize;
     return (size_t)(op-ostart);
 }
 
-static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
+static
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
+size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
                                         void* dst, size_t dstCapacity,
                                   const void* src, size_t srcSize,
                                   const void* dict, size_t dictSize,
@@ -955,17 +1041,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
     while (srcSize >= ZSTD_startingInputLength(dctx->format)) {
 
 
-        {   U32 const magicNumber = MEM_readLE32(src);
-            DEBUGLOG(4, "reading magic number %08X (expecting %08X)",
-                        (unsigned)magicNumber, ZSTD_MAGICNUMBER);
+        if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) {
+            U32 const magicNumber = MEM_readLE32(src);
+            DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber);
             if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+                /* skippable frame detected : skip it */
                 size_t const skippableSize = readSkippableFrameSize(src, srcSize);
-                FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed");
+                FORWARD_IF_ERROR(skippableSize, "invalid skippable frame");
                 assert(skippableSize <= srcSize);
 
                 src = (const BYTE *)src + skippableSize;
                 srcSize -= skippableSize;
-                continue;
+                continue; /* check next frame */
         }   }
 
         if (ddict) {
@@ -1061,8 +1148,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr
 size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; }
 
 /*
- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed,
- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can
+ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we
+ * allow taking a partial block as the input. Currently only raw uncompressed blocks can
  * be streamed.
  *
  * For blocks that can be streamed, this allows us to reduce the latency until we produce
@@ -1181,7 +1268,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
             {
             case bt_compressed:
                 DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed");
-                rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming);
+                assert(dctx->isFrameDecompression == 1);
+                rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming);
                 dctx->expected = 0;  /* Streaming not supported */
                 break;
             case bt_raw :
@@ -1250,6 +1338,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
     case ZSTDds_decodeSkippableHeader:
         assert(src != NULL);
         assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE);
+        assert(dctx->format != ZSTD_f_zstd1_magicless);
         ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize);   /* complete skippable header */
         dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE);   /* note : dctx->expected can grow seriously large, beyond local buffer size */
         dctx->stage = ZSTDds_skipFrame;
@@ -1262,7 +1351,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c
 
     default:
         assert(0);   /* impossible */
-        RETURN_ERROR(GENERIC, "impossible to reach");   /* some compiler require default to do something */
+        RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
     }
 }
 
@@ -1303,11 +1392,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
         /* in minimal huffman, we always use X1 variants */
         size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable,
                                                 dictPtr, dictEnd - dictPtr,
-                                                workspace, workspaceSize);
+                                                workspace, workspaceSize, /* flags */ 0);
 #else
         size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
                                                 dictPtr, (size_t)(dictEnd - dictPtr),
-                                                workspace, workspaceSize);
+                                                workspace, workspaceSize, /* flags */ 0);
 #endif
         RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, "");
         dictPtr += hSize;
@@ -1403,10 +1492,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
     dctx->prefixStart = NULL;
     dctx->virtualStart = NULL;
     dctx->dictEnd = NULL;
-    dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001);  /* cover both little and big endian */
+    dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001);  /* cover both little and big endian */
     dctx->litEntropy = dctx->fseEntropy = 0;
     dctx->dictID = 0;
     dctx->bType = bt_reserved;
+    dctx->isFrameDecompression = 1;
     ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue));
     ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue));  /* initial repcodes */
     dctx->LLTptr = dctx->entropy.LLTable;
@@ -1465,7 +1555,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
  *  This could for one of the following reasons :
  *  - The frame does not require a dictionary (most common case).
  *  - The frame was built with dictID intentionally removed.
- *    Needed dictionary is a hidden information.
+ *    Needed dictionary is a hidden piece of information.
  *    Note : this use case also happens when using a non-conformant dictionary.
  *  - `srcSize` is too small, and as a result, frame header could not be decoded.
  *    Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
@@ -1474,7 +1564,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
  *  ZSTD_getFrameHeader(), which will provide a more precise error code. */
 unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
 {
-    ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 };
+    ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 };
     size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize);
     if (ZSTD_isError(hError)) return 0;
     return zfp.dictID;
@@ -1581,7 +1671,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di
 size_t ZSTD_initDStream(ZSTD_DStream* zds)
 {
     DEBUGLOG(4, "ZSTD_initDStream");
-    return ZSTD_initDStream_usingDDict(zds, NULL);
+    FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), "");
+    FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), "");
+    return ZSTD_startingInputLength(zds->format);
 }
 
 /* ZSTD_initDStream_usingDDict() :
@@ -1589,6 +1681,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds)
  * this function cannot fail */
 size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
 {
+    DEBUGLOG(4, "ZSTD_initDStream_usingDDict");
     FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , "");
     FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , "");
     return ZSTD_startingInputLength(dctx->format);
@@ -1599,6 +1692,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
  * this function cannot fail */
 size_t ZSTD_resetDStream(ZSTD_DStream* dctx)
 {
+    DEBUGLOG(4, "ZSTD_resetDStream");
     FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), "");
     return ZSTD_startingInputLength(dctx->format);
 }
@@ -1670,6 +1764,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam)
             bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict;
             bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts;
             return bounds;
+        case ZSTD_d_disableHuffmanAssembly:
+            bounds.lowerBound = 0;
+            bounds.upperBound = 1;
+            return bounds;
+        case ZSTD_d_maxBlockSize:
+            bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN;
+            bounds.upperBound = ZSTD_BLOCKSIZE_MAX;
+            return bounds;
+
         default:;
     }
     bounds.error = ERROR(parameter_unsupported);
@@ -1710,6 +1813,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value
         case ZSTD_d_refMultipleDDicts:
             *value = (int)dctx->refMultipleDDicts;
             return 0;
+        case ZSTD_d_disableHuffmanAssembly:
+            *value = (int)dctx->disableHufAsm;
+            return 0;
+        case ZSTD_d_maxBlockSize:
+            *value = dctx->maxBlockSizeParam;
+            return 0;
         default:;
     }
     RETURN_ERROR(parameter_unsupported, "");
@@ -1743,6 +1852,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value
             }
             dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value;
             return 0;
+        case ZSTD_d_disableHuffmanAssembly:
+            CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value);
+            dctx->disableHufAsm = value != 0;
+            return 0;
+        case ZSTD_d_maxBlockSize:
+            if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value);
+            dctx->maxBlockSizeParam = value;
+            return 0;
         default:;
     }
     RETURN_ERROR(parameter_unsupported, "");
@@ -1754,6 +1871,7 @@ size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset)
       || (reset == ZSTD_reset_session_and_parameters) ) {
         dctx->streamStage = zdss_init;
         dctx->noForwardProgress = 0;
+        dctx->isFrameDecompression = 1;
     }
     if ( (reset == ZSTD_reset_parameters)
       || (reset == ZSTD_reset_session_and_parameters) ) {
@@ -1770,11 +1888,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx)
     return ZSTD_sizeof_DCtx(dctx);
 }
 
-size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)
+static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax)
 {
-    size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
-    /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/
-    unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2);
+    size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax);
+    /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block
+     * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing
+     * the block at the beginning of the output buffer, and maintain a full window.
+     *
+     * We need another blockSize worth of buffer so that we can store split
+     * literals at the end of the block without overwriting the extDict window.
+     */
+    unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2);
     unsigned long long const neededSize = MIN(frameContentSize, neededRBSize);
     size_t const minRBSize = (size_t) neededSize;
     RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize,
@@ -1782,6 +1906,11 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long
     return minRBSize;
 }
 
+size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)
+{
+    return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX);
+}
+
 size_t ZSTD_estimateDStreamSize(size_t windowSize)
 {
     size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
@@ -1918,7 +2047,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 if (zds->refMultipleDDicts && zds->ddictSet) {
                     ZSTD_DCtx_selectFrameDDict(zds);
                 }
-                DEBUGLOG(5, "header size : %u", (U32)hSize);
                 if (ZSTD_isError(hSize)) {
                     return hSize;   /* error */
                 }
@@ -1932,6 +2060,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                             zds->lhSize += remainingInput;
                         }
                         input->pos = input->size;
+                        /* check first few bytes */
+                        FORWARD_IF_ERROR(
+                            ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format),
+                            "First few bytes detected incorrect" );
+                        /* return hint input size */
                         return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize;   /* remaining header bytes + next block header */
                     }
                     assert(ip != NULL);
@@ -1943,14 +2076,15 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
             if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN
                 && zds->fParams.frameType != ZSTD_skippableFrame
                 && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) {
-                size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart));
+                size_t const cSize = ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format);
                 if (cSize <= (size_t)(iend-istart)) {
                     /* shortcut : using single-pass mode */
                     size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds));
                     if (ZSTD_isError(decompressedSize)) return decompressedSize;
-                    DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()")
+                    DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()");
+                    assert(istart != NULL);
                     ip = istart + cSize;
-                    op += decompressedSize;
+                    op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */
                     zds->expected = 0;
                     zds->streamStage = zdss_init;
                     someMoreWork = 0;
@@ -1969,7 +2103,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
             DEBUGLOG(4, "Consume header");
             FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), "");
 
-            if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
+            if (zds->format == ZSTD_f_zstd1
+                && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {  /* skippable frame */
                 zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE);
                 zds->stage = ZSTDds_skipFrame;
             } else {
@@ -1985,11 +2120,13 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
             zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN);
             RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize,
                             frameParameter_windowTooLarge, "");
+            if (zds->maxBlockSizeParam != 0)
+                zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam);
 
             /* Adapt buffer sizes to frame header instructions */
             {   size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */);
                 size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered
-                        ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize)
+                        ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax)
                         : 0;
 
                 ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize);
@@ -2034,6 +2171,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 }
                 if ((size_t)(iend-ip) >= neededInSize) {  /* decode directly from src */
                     FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), "");
+                    assert(ip != NULL);
                     ip += neededInSize;
                     /* Function modifies the stage so we must break */
                     break;
@@ -2048,7 +2186,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 int const isSkipFrame = ZSTD_isSkipFrame(zds);
                 size_t loadedSize;
                 /* At this point we shouldn't be decompressing a block that we can stream. */
-                assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip));
+                assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip)));
                 if (isSkipFrame) {
                     loadedSize = MIN(toLoad, (size_t)(iend-ip));
                 } else {
@@ -2057,8 +2195,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                                     "should never happen");
                     loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip));
                 }
-                ip += loadedSize;
-                zds->inPos += loadedSize;
+                if (loadedSize != 0) {
+                    /* ip may be NULL */
+                    ip += loadedSize;
+                    zds->inPos += loadedSize;
+                }
                 if (loadedSize < toLoad) { someMoreWork = 0; break; }   /* not enough input, wait for more */
 
                 /* decode loaded input */
@@ -2068,14 +2209,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
                 break;
             }
         case zdss_flush:
-            {   size_t const toFlushSize = zds->outEnd - zds->outStart;
+            {
+                size_t const toFlushSize = zds->outEnd - zds->outStart;
                 size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize);
-                op += flushedSize;
+
+                op = op ? op + flushedSize : op;
+
                 zds->outStart += flushedSize;
                 if (flushedSize == toFlushSize) {  /* flush completed */
                     zds->streamStage = zdss_read;
                     if ( (zds->outBuffSize < zds->fParams.frameContentSize)
-                      && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
+                        && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
                         DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)",
                                 (int)(zds->outBuffSize - zds->outStart),
                                 (U32)zds->fParams.blockSizeMax);
@@ -2089,7 +2233,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
 
         default:
             assert(0);    /* impossible */
-            RETURN_ERROR(GENERIC, "impossible to reach");   /* some compiler require default to do something */
+            RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
     }   }
 
     /* result */
@@ -2102,8 +2246,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB
     if ((ip==istart) && (op==ostart)) {  /* no forward progress */
         zds->noForwardProgress ++;
         if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {
-            RETURN_ERROR_IF(op==oend, dstSize_tooSmall, "");
-            RETURN_ERROR_IF(ip==iend, srcSize_wrong, "");
+            RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, "");
+            RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, "");
             assert(0);
         }
     } else {
@@ -2140,11 +2284,17 @@ size_t ZSTD_decompressStream_simpleArgs (
                             void* dst, size_t dstCapacity, size_t* dstPos,
                       const void* src, size_t srcSize, size_t* srcPos)
 {
-    ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
-    ZSTD_inBuffer  input  = { src, srcSize, *srcPos };
-    /* ZSTD_compress_generic() will check validity of dstPos and srcPos */
-    size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
-    *dstPos = output.pos;
-    *srcPos = input.pos;
-    return cErr;
+    ZSTD_outBuffer output;
+    ZSTD_inBuffer  input;
+    output.dst = dst;
+    output.size = dstCapacity;
+    output.pos = *dstPos;
+    input.src = src;
+    input.size = srcSize;
+    input.pos = *srcPos;
+    {   size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
+        *dstPos = output.pos;
+        *srcPos = input.pos;
+        return cErr;
+    }
 }
diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c
index c1913b8e7c89..9fe9a12c8a2c 100644
--- a/lib/zstd/decompress/zstd_decompress_block.c
+++ b/lib/zstd/decompress/zstd_decompress_block.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -20,12 +21,12 @@
 #include "../common/mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
 #include "../common/fse.h"
-#define HUF_STATIC_LINKING_ONLY
 #include "../common/huf.h"
 #include "../common/zstd_internal.h"
 #include "zstd_decompress_internal.h"   /* ZSTD_DCtx */
 #include "zstd_ddict.h"  /* ZSTD_DDictDictContent */
 #include "zstd_decompress_block.h"
+#include "../common/bits.h"  /* ZSTD_highbit32 */
 
 /*_*******************************************************
 *  Macros
@@ -51,6 +52,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); }
  *   Block decoding
  ***************************************************************/
 
+static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx)
+{
+    size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX;
+    assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX);
+    return blockSizeMax;
+}
+
 /*! ZSTD_getcBlockSize() :
  *  Provides the size of compressed block from block header `src` */
 size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
@@ -73,41 +81,49 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
 static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize,
     const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately)
 {
-    if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH)
-    {
-        /* room for litbuffer to fit without read faulting */
-        dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH;
+    size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
+    assert(litSize <= blockSizeMax);
+    assert(dctx->isFrameDecompression || streaming == not_streaming);
+    assert(expectedWriteSize <= blockSizeMax);
+    if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) {
+        /* If we aren't streaming, we can just put the literals after the output
+         * of the current block. We don't need to worry about overwriting the
+         * extDict of our window, because it doesn't exist.
+         * So if we have space after the end of the block, just put it there.
+         */
+        dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH;
         dctx->litBufferEnd = dctx->litBuffer + litSize;
         dctx->litBufferLocation = ZSTD_in_dst;
-    }
-    else if (litSize > ZSTD_LITBUFFEREXTRASIZE)
-    {
-        /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
+    } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) {
+        /* Literals fit entirely within the extra buffer, put them there to avoid
+         * having to split the literals.
+         */
+        dctx->litBuffer = dctx->litExtraBuffer;
+        dctx->litBufferEnd = dctx->litBuffer + litSize;
+        dctx->litBufferLocation = ZSTD_not_in_dst;
+    } else {
+        assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE);
+        /* Literals must be split between the output block and the extra lit
+         * buffer. We fill the extra lit buffer with the tail of the literals,
+         * and put the rest of the literals at the end of the block, with
+         * WILDCOPY_OVERLENGTH of buffer room to allow for overreads.
+         * This MUST not write more than our maxBlockSize beyond dst, because in
+         * streaming mode, that could overwrite part of our extDict window.
+         */
         if (splitImmediately) {
             /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */
             dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
             dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE;
-        }
-        else {
-            /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */
+        } else {
+            /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */
             dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize;
             dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize;
         }
         dctx->litBufferLocation = ZSTD_split;
-    }
-    else
-    {
-        /* fits entirely within litExtraBuffer, so no split is necessary */
-        dctx->litBuffer = dctx->litExtraBuffer;
-        dctx->litBufferEnd = dctx->litBuffer + litSize;
-        dctx->litBufferLocation = ZSTD_not_in_dst;
+        assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize);
     }
 }
 
-/* Hidden declaration for fullbench */
-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
-                          const void* src, size_t srcSize,
-                          void* dst, size_t dstCapacity, const streaming_operation streaming);
 /*! ZSTD_decodeLiteralsBlock() :
  * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored
  * in the dstBuffer.  If there is room to do so, it will be stored in full in the excess dst space after where the current
@@ -116,7 +132,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
  *
  * @return : nb of bytes read from src (< srcSize )
  *  note : symbol not declared but exposed for fullbench */
-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                           const void* src, size_t srcSize,   /* note : srcSize < BLOCKSIZE */
                           void* dst, size_t dstCapacity, const streaming_operation streaming)
 {
@@ -125,6 +141,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
 
     {   const BYTE* const istart = (const BYTE*) src;
         symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
+        size_t const blockSizeMax = ZSTD_blockSizeMax(dctx);
 
         switch(litEncType)
         {
@@ -134,13 +151,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
             ZSTD_FALLTHROUGH;
 
         case set_compressed:
-            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
+            RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3");
             {   size_t lhSize, litSize, litCSize;
                 U32 singleStream=0;
                 U32 const lhlCode = (istart[0] >> 2) & 3;
                 U32 const lhc = MEM_readLE32(istart);
                 size_t hufSuccess;
-                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
+                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
+                int const flags = 0
+                    | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0)
+                    | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0);
                 switch(lhlCode)
                 {
                 case 0: case 1: default:   /* note : default is impossible, since lhlCode into [0..3] */
@@ -164,7 +184,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     break;
                 }
                 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
-                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
+                if (!singleStream)
+                    RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong,
+                        "Not enough literals (%zu) for the 4-streams mode (min %u)",
+                        litSize, MIN_LITERALS_FOR_4_STREAMS);
                 RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
                 RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, "");
                 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0);
@@ -176,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
 
                 if (litEncType==set_repeat) {
                     if (singleStream) {
-                        hufSuccess = HUF_decompress1X_usingDTable_bmi2(
+                        hufSuccess = HUF_decompress1X_usingDTable(
                             dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                            dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
+                            dctx->HUFptr, flags);
                     } else {
-                        hufSuccess = HUF_decompress4X_usingDTable_bmi2(
+                        assert(litSize >= MIN_LITERALS_FOR_4_STREAMS);
+                        hufSuccess = HUF_decompress4X_usingDTable(
                             dctx->litBuffer, litSize, istart+lhSize, litCSize,
-                            dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx));
+                            dctx->HUFptr, flags);
                     }
                 } else {
                     if (singleStream) {
@@ -190,26 +215,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                         hufSuccess = HUF_decompress1X_DCtx_wksp(
                             dctx->entropy.hufTable, dctx->litBuffer, litSize,
                             istart+lhSize, litCSize, dctx->workspace,
-                            sizeof(dctx->workspace));
+                            sizeof(dctx->workspace), flags);
 #else
-                        hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
+                        hufSuccess = HUF_decompress1X1_DCtx_wksp(
                             dctx->entropy.hufTable, dctx->litBuffer, litSize,
                             istart+lhSize, litCSize, dctx->workspace,
-                            sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
+                            sizeof(dctx->workspace), flags);
 #endif
                     } else {
-                        hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
+                        hufSuccess = HUF_decompress4X_hufOnly_wksp(
                             dctx->entropy.hufTable, dctx->litBuffer, litSize,
                             istart+lhSize, litCSize, dctx->workspace,
-                            sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx));
+                            sizeof(dctx->workspace), flags);
                     }
                 }
                 if (dctx->litBufferLocation == ZSTD_split)
                 {
+                    assert(litSize > ZSTD_LITBUFFEREXTRASIZE);
                     ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE);
                     ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE);
                     dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH;
                     dctx->litBufferEnd -= WILDCOPY_OVERLENGTH;
+                    assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax);
                 }
 
                 RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");
@@ -224,7 +251,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
         case set_basic:
             {   size_t litSize, lhSize;
                 U32 const lhlCode = ((istart[0]) >> 2) & 3;
-                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
+                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
                 switch(lhlCode)
                 {
                 case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
@@ -237,11 +264,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     break;
                 case 3:
                     lhSize = 3;
+                    RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3");
                     litSize = MEM_readLE24(istart) >> 4;
                     break;
                 }
 
                 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
+                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
                 RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
                 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                 if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) {  /* risk reading beyond src buffer with wildcopy */
@@ -270,7 +299,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
         case set_rle:
             {   U32 const lhlCode = ((istart[0]) >> 2) & 3;
                 size_t litSize, lhSize;
-                size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity);
+                size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity);
                 switch(lhlCode)
                 {
                 case 0: case 2: default:   /* note : default is impossible, since lhlCode into [0..3] */
@@ -279,16 +308,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
                     break;
                 case 1:
                     lhSize = 2;
+                    RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3");
                     litSize = MEM_readLE16(istart) >> 4;
                     break;
                 case 3:
                     lhSize = 3;
+                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4");
                     litSize = MEM_readLE24(istart) >> 4;
-                    RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
                     break;
                 }
                 RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled");
-                RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+                RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, "");
                 RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, "");
                 ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1);
                 if (dctx->litBufferLocation == ZSTD_split)
@@ -310,6 +340,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
     }
 }
 
+/* Hidden declaration for fullbench */
+size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
+                          const void* src, size_t srcSize,
+                          void* dst, size_t dstCapacity);
+size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx,
+                          const void* src, size_t srcSize,
+                          void* dst, size_t dstCapacity)
+{
+    dctx->isFrameDecompression = 0;
+    return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming);
+}
+
 /* Default FSE distribution tables.
  * These are pre-calculated FSE decoding tables using default distributions as defined in specification :
  * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions
@@ -506,14 +548,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
                 for (i = 8; i < n; i += 8) {
                     MEM_write64(spread + pos + i, sv);
                 }
-                pos += n;
+                assert(n>=0);
+                pos += (size_t)n;
             }
         }
         /* Now we spread those positions across the table.
-         * The benefit of doing it in two stages is that we avoid the the
+         * The benefit of doing it in two stages is that we avoid the
          * variable size inner loop, which caused lots of branch misses.
          * Now we can run through all the positions without any branch misses.
-         * We unroll the loop twice, since that is what emperically worked best.
+         * We unroll the loop twice, since that is what empirically worked best.
          */
         {
             size_t position = 0;
@@ -540,7 +583,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
             for (i=0; i<n; i++) {
                 tableDecode[position].baseValue = s;
                 position = (position + step) & tableMask;
-                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+                while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask;   /* lowprob area */
         }   }
         assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
     }
@@ -551,7 +594,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt,
         for (u=0; u<tableSize; u++) {
             U32 const symbol = tableDecode[u].baseValue;
             U32 const nextState = symbolNext[symbol]++;
-            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+            tableDecode[u].nbBits = (BYTE) (tableLog - ZSTD_highbit32(nextState) );
             tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
             assert(nbAdditionalBits[symbol] < 255);
             tableDecode[u].nbAdditionalBits = nbAdditionalBits[symbol];
@@ -664,11 +707,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
 
     /* SeqHead */
     nbSeq = *ip++;
-    if (!nbSeq) {
-        *nbSeqPtr=0;
-        RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
-        return 1;
-    }
     if (nbSeq > 0x7F) {
         if (nbSeq == 0xFF) {
             RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
@@ -681,8 +719,16 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
     }
     *nbSeqPtr = nbSeq;
 
+    if (nbSeq == 0) {
+        /* No sequence : section ends immediately */
+        RETURN_ERROR_IF(ip != iend, corruption_detected,
+            "extraneous data present in the Sequences section");
+        return (size_t)(ip - istart);
+    }
+
     /* FSE table descriptors */
     RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
+    RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */
     {   symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
         symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
         symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
@@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, pt
 /* ZSTD_safecopyDstBeforeSrc():
  * This version allows overlap with dst before src, or handles the non-overlap case with dst after src
  * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */
-static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) {
+static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) {
     ptrdiff_t const diff = op - ip;
     BYTE* const oend = op + length;
 
@@ -858,6 +904,7 @@ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length
  * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
  */
 FORCE_NOINLINE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
 size_t ZSTD_execSequenceEnd(BYTE* op,
     BYTE* const oend, seq_t sequence,
     const BYTE** litPtr, const BYTE* const litLimit,
@@ -905,6 +952,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op,
  * This version is intended to be used during instances where the litBuffer is still split.  It is kept separate to avoid performance impact for the good case.
  */
 FORCE_NOINLINE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
 size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
     BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
     const BYTE** litPtr, const BYTE* const litLimit,
@@ -950,6 +998,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op,
 }
 
 HINT_INLINE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
 size_t ZSTD_execSequence(BYTE* op,
     BYTE* const oend, seq_t sequence,
     const BYTE** litPtr, const BYTE* const litLimit,
@@ -964,6 +1013,11 @@ size_t ZSTD_execSequence(BYTE* op,
 
     assert(op != NULL /* Precondition */);
     assert(oend_w < oend /* No underflow */);
+
+#if defined(__aarch64__)
+    /* prefetch sequence starting from match that will be used for copy later */
+    PREFETCH_L1(match);
+#endif
     /* Handle edge cases in a slow path:
      *   - Read beyond end of literals
      *   - Match end is within WILDCOPY_OVERLIMIT of oend
@@ -1043,6 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op,
 }
 
 HINT_INLINE
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
 size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op,
     BYTE* const oend, const BYTE* const oend_w, seq_t sequence,
     const BYTE** litPtr, const BYTE* const litLimit,
@@ -1154,7 +1209,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16
 }
 
 /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
+ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32
  * bits before reloading. This value is the maximum number of bytes we read
  * after reloading when we are decoding long offsets.
  */
@@ -1165,13 +1220,37 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16
 
 typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
 
+/*
+ * ZSTD_decodeSequence():
+ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets
+ *                  only used in 32-bit mode
+ * @return : Sequence (litL + matchL + offset)
+ */
 FORCE_INLINE_TEMPLATE seq_t
-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
+ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq)
 {
     seq_t seq;
+    /*
+     * ZSTD_seqSymbol is a 64 bits wide structure.
+     * It can be loaded in one operation
+     * and its fields extracted by simply shifting or bit-extracting on aarch64.
+     * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh
+     * operations that cause performance drop. This can be avoided by using this
+     * ZSTD_memcpy hack.
+     */
+#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__))
+    ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS;
+    ZSTD_seqSymbol* const llDInfo = &llDInfoS;
+    ZSTD_seqSymbol* const mlDInfo = &mlDInfoS;
+    ZSTD_seqSymbol* const ofDInfo = &ofDInfoS;
+    ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol));
+    ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol));
+    ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol));
+#else
     const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state;
     const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state;
     const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state;
+#endif
     seq.matchLength = mlDInfo->baseValue;
     seq.litLength = llDInfo->baseValue;
     {   U32 const ofBase = ofDInfo->baseValue;
@@ -1186,28 +1265,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
         U32 const llnbBits = llDInfo->nbBits;
         U32 const mlnbBits = mlDInfo->nbBits;
         U32 const ofnbBits = ofDInfo->nbBits;
+
+        assert(llBits <= MaxLLBits);
+        assert(mlBits <= MaxMLBits);
+        assert(ofBits <= MaxOff);
         /*
          * As gcc has better branch and block analyzers, sometimes it is only
-         * valuable to mark likelyness for clang, it gives around 3-4% of
+         * valuable to mark likeliness for clang, it gives around 3-4% of
          * performance.
          */
 
         /* sequence */
         {   size_t offset;
-    #if defined(__clang__)
-            if (LIKELY(ofBits > 1)) {
-    #else
             if (ofBits > 1) {
-    #endif
                 ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
                 ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
-                assert(ofBits <= MaxOff);
+                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32);
+                ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits);
                 if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
-                    U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
+                    /* Always read extra bits, this keeps the logic simple,
+                     * avoids branches, and avoids accidentally reading 0 bits.
+                     */
+                    U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32;
                     offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
                     BIT_reloadDStream(&seqState->DStream);
-                    if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
-                    assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32);   /* to avoid another reload */
+                    offset += BIT_readBitsFast(&seqState->DStream, extraBits);
                 } else {
                     offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/);   /* <=  (ZSTD_WINDOWLOG_MAX-1) bits */
                     if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
@@ -1224,7 +1306,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
                 } else {
                     offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
                     {   size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
-                        temp += !temp;   /* 0 is not valid; input is corrupted; force offset to 1 */
+                        temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */
                         if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
                         seqState->prevOffset[1] = seqState->prevOffset[0];
                         seqState->prevOffset[0] = offset = temp;
@@ -1232,11 +1314,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
             seq.offset = offset;
         }
 
-    #if defined(__clang__)
-        if (UNLIKELY(mlBits > 0))
-    #else
         if (mlBits > 0)
-    #endif
             seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
 
         if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
@@ -1246,11 +1324,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
         /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
         ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
 
-    #if defined(__clang__)
-        if (UNLIKELY(llBits > 0))
-    #else
         if (llBits > 0)
-    #endif
             seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
 
         if (MEM_32bits())
@@ -1259,17 +1333,22 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets)
         DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
                     (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
 
-        ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
-        ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
-        if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
-        ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */
+        if (!isLastSeq) {
+            /* don't update FSE state for last Sequence */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits);    /* <=  9 bits */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits);    /* <=  9 bits */
+            if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);    /* <= 18 bits */
+            ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits);  /* <=  8 bits */
+            BIT_reloadDStream(&seqState->DStream);
+        }
     }
 
     return seq;
 }
 
-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+#if DEBUGLEVEL >= 1
+static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
 {
     size_t const windowSize = dctx->fParams.windowSize;
     /* No dictionary used. */
@@ -1283,30 +1362,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix
     /* Dictionary is active. */
     return 1;
 }
+#endif
 
-MEM_STATIC void ZSTD_assertValidSequence(
+static void ZSTD_assertValidSequence(
         ZSTD_DCtx const* dctx,
         BYTE const* op, BYTE const* oend,
         seq_t const seq,
         BYTE const* prefixStart, BYTE const* virtualStart)
 {
 #if DEBUGLEVEL >= 1
-    size_t const windowSize = dctx->fParams.windowSize;
-    size_t const sequenceSize = seq.litLength + seq.matchLength;
-    BYTE const* const oLitEnd = op + seq.litLength;
-    DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
-            (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
-    assert(op <= oend);
-    assert((size_t)(oend - op) >= sequenceSize);
-    assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
-    if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
-        size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
-        /* Offset must be within the dictionary. */
-        assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
-        assert(seq.offset <= windowSize + dictSize);
-    } else {
-        /* Offset must be within our window. */
-        assert(seq.offset <= windowSize);
+    if (dctx->isFrameDecompression) {
+        size_t const windowSize = dctx->fParams.windowSize;
+        size_t const sequenceSize = seq.litLength + seq.matchLength;
+        BYTE const* const oLitEnd = op + seq.litLength;
+        DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
+                (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+        assert(op <= oend);
+        assert((size_t)(oend - op) >= sequenceSize);
+        assert(sequenceSize <= ZSTD_blockSizeMax(dctx));
+        if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
+            size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
+            /* Offset must be within the dictionary. */
+            assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
+            assert(seq.offset <= windowSize + dictSize);
+        } else {
+            /* Offset must be within our window. */
+            assert(seq.offset <= windowSize);
+        }
     }
 #else
     (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart;
@@ -1322,23 +1404,21 @@ DONT_VECTORIZE
 ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
                                void* dst, size_t maxDstSize,
                          const void* seqStart, size_t seqSize, int nbSeq,
-                         const ZSTD_longOffset_e isLongOffset,
-                         const int frame)
+                         const ZSTD_longOffset_e isLongOffset)
 {
     const BYTE* ip = (const BYTE*)seqStart;
     const BYTE* const iend = ip + seqSize;
     BYTE* const ostart = (BYTE*)dst;
-    BYTE* const oend = ostart + maxDstSize;
+    BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
     BYTE* op = ostart;
     const BYTE* litPtr = dctx->litPtr;
     const BYTE* litBufferEnd = dctx->litBufferEnd;
     const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
     const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
     const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
-    DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer");
-    (void)frame;
+    DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq);
 
-    /* Regen sequences */
+    /* Literals are split between internal buffer & output buffer */
     if (nbSeq) {
         seqState_t seqState;
         dctx->fseEntropy = 1;
@@ -1357,8 +1437,7 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
                 BIT_DStream_completed < BIT_DStream_overflow);
 
         /* decompress without overrunning litPtr begins */
-        {
-            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+        {   seq_t sequence = {0,0,0};  /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */
             /* Align the decompression loop to 32 + 16 bytes.
                 *
                 * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
@@ -1420,27 +1499,26 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
 #endif
 
             /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */
-            for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) {
-                size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+            for ( ; nbSeq; nbSeq--) {
+                sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
+                if (litPtr + sequence.litLength > dctx->litBufferEnd) break;
+                {   size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
-                assert(!ZSTD_isError(oneSeqSize));
-                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+                    assert(!ZSTD_isError(oneSeqSize));
+                    ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
 #endif
-                if (UNLIKELY(ZSTD_isError(oneSeqSize)))
-                    return oneSeqSize;
-                DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
-                op += oneSeqSize;
-                if (UNLIKELY(!--nbSeq))
-                    break;
-                BIT_reloadDStream(&(seqState.DStream));
-                sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
-            }
+                    if (UNLIKELY(ZSTD_isError(oneSeqSize)))
+                        return oneSeqSize;
+                    DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+                    op += oneSeqSize;
+            }   }
+            DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)");
 
             /* If there are more sequences, they will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */
             if (nbSeq > 0) {
                 const size_t leftoverLit = dctx->litBufferEnd - litPtr;
-                if (leftoverLit)
-                {
+                DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength);
+                if (leftoverLit) {
                     RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
                     ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
                     sequence.litLength -= leftoverLit;
@@ -1449,24 +1527,22 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
                 litPtr = dctx->litExtraBuffer;
                 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
                 dctx->litBufferLocation = ZSTD_not_in_dst;
-                {
-                    size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
+                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                     assert(!ZSTD_isError(oneSeqSize));
-                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+                    ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
 #endif
                     if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                         return oneSeqSize;
                     DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
                     op += oneSeqSize;
-                    if (--nbSeq)
-                        BIT_reloadDStream(&(seqState.DStream));
                 }
+                nbSeq--;
             }
         }
 
-        if (nbSeq > 0) /* there is remaining lit from extra buffer */
-        {
+        if (nbSeq > 0) {
+            /* there is remaining lit from extra buffer */
 
 #if defined(__x86_64__)
             __asm__(".p2align 6");
@@ -1485,35 +1561,34 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
 #  endif
 #endif
 
-            for (; ; ) {
-                seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+            for ( ; nbSeq ; nbSeq--) {
+                seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
                 size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd);
 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                 assert(!ZSTD_isError(oneSeqSize));
-                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+                ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
 #endif
                 if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                     return oneSeqSize;
                 DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
                 op += oneSeqSize;
-                if (UNLIKELY(!--nbSeq))
-                    break;
-                BIT_reloadDStream(&(seqState.DStream));
             }
         }
 
         /* check if reached exact end */
         DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq);
         RETURN_ERROR_IF(nbSeq, corruption_detected, "");
-        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
+        DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed);
+        RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
         /* save reps for next block */
         { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
     }
 
     /* last literal segment */
-    if (dctx->litBufferLocation == ZSTD_split)  /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
-    {
-        size_t const lastLLSize = litBufferEnd - litPtr;
+    if (dctx->litBufferLocation == ZSTD_split) {
+        /* split hasn't been reached yet, first get dst then copy litExtraBuffer */
+        size_t const lastLLSize = (size_t)(litBufferEnd - litPtr);
+        DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize);
         RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
         if (op != NULL) {
             ZSTD_memmove(op, litPtr, lastLLSize);
@@ -1523,15 +1598,17 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx,
         litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
         dctx->litBufferLocation = ZSTD_not_in_dst;
     }
-    {   size_t const lastLLSize = litBufferEnd - litPtr;
+    /* copy last literals from internal buffer */
+    {   size_t const lastLLSize = (size_t)(litBufferEnd - litPtr);
+        DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize);
         RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
         if (op != NULL) {
             ZSTD_memcpy(op, litPtr, lastLLSize);
             op += lastLLSize;
-        }
-    }
+    }   }
 
-    return op-ostart;
+    DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart));
+    return (size_t)(op - ostart);
 }
 
 FORCE_INLINE_TEMPLATE size_t
@@ -1539,21 +1616,19 @@ DONT_VECTORIZE
 ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
     void* dst, size_t maxDstSize,
     const void* seqStart, size_t seqSize, int nbSeq,
-    const ZSTD_longOffset_e isLongOffset,
-    const int frame)
+    const ZSTD_longOffset_e isLongOffset)
 {
     const BYTE* ip = (const BYTE*)seqStart;
     const BYTE* const iend = ip + seqSize;
     BYTE* const ostart = (BYTE*)dst;
-    BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer;
+    BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer;
     BYTE* op = ostart;
     const BYTE* litPtr = dctx->litPtr;
     const BYTE* const litEnd = litPtr + dctx->litSize;
     const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart);
     const BYTE* const vBase = (const BYTE*)(dctx->virtualStart);
     const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd);
-    DEBUGLOG(5, "ZSTD_decompressSequences_body");
-    (void)frame;
+    DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq);
 
     /* Regen sequences */
     if (nbSeq) {
@@ -1568,11 +1643,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
         ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
         assert(dst != NULL);
 
-        ZSTD_STATIC_ASSERT(
-            BIT_DStream_unfinished < BIT_DStream_completed &&
-            BIT_DStream_endOfBuffer < BIT_DStream_completed &&
-            BIT_DStream_completed < BIT_DStream_overflow);
-
 #if defined(__x86_64__)
             __asm__(".p2align 6");
             __asm__("nop");
@@ -1587,73 +1657,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx,
 #  endif
 #endif
 
-        for ( ; ; ) {
-            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+        for ( ; nbSeq ; nbSeq--) {
+            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1);
             size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
             assert(!ZSTD_isError(oneSeqSize));
-            if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+            ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
 #endif
             if (UNLIKELY(ZSTD_isError(oneSeqSize)))
                 return oneSeqSize;
             DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
             op += oneSeqSize;
-            if (UNLIKELY(!--nbSeq))
-                break;
-            BIT_reloadDStream(&(seqState.DStream));
         }
 
         /* check if reached exact end */
-        DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
-        RETURN_ERROR_IF(nbSeq, corruption_detected, "");
-        RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
+        assert(nbSeq == 0);
+        RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
         /* save reps for next block */
         { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
     }
 
     /* last literal segment */
-    {   size_t const lastLLSize = litEnd - litPtr;
+    {   size_t const lastLLSize = (size_t)(litEnd - litPtr);
+        DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize);
         RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
         if (op != NULL) {
             ZSTD_memcpy(op, litPtr, lastLLSize);
             op += lastLLSize;
-        }
-    }
+    }   }
 
-    return op-ostart;
+    DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart));
+    return (size_t)(op - ostart);
 }
 
 static size_t
 ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
                                  void* dst, size_t maxDstSize,
                            const void* seqStart, size_t seqSize, int nbSeq,
-                           const ZSTD_longOffset_e isLongOffset,
-                           const int frame)
+                           const ZSTD_longOffset_e isLongOffset)
 {
-    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
 }
 
 static size_t
 ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx,
                                                void* dst, size_t maxDstSize,
                                          const void* seqStart, size_t seqSize, int nbSeq,
-                                         const ZSTD_longOffset_e isLongOffset,
-                                         const int frame)
+                                         const ZSTD_longOffset_e isLongOffset)
 {
-    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
 }
 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
 
 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
 
-FORCE_INLINE_TEMPLATE size_t
-ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
+FORCE_INLINE_TEMPLATE
+
+size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence,
                    const BYTE* const prefixStart, const BYTE* const dictEnd)
 {
     prefetchPos += sequence.litLength;
     {   const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart;
-        const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
-                                                                              * No consequence though : memory address is only used for prefetching, not for dereferencing */
+        /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
+         * No consequence though : memory address is only used for prefetching, not for dereferencing */
+        const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset);
         PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE);   /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
     }
     return prefetchPos + sequence.matchLength;
@@ -1668,20 +1735,18 @@ ZSTD_decompressSequencesLong_body(
                                ZSTD_DCtx* dctx,
                                void* dst, size_t maxDstSize,
                          const void* seqStart, size_t seqSize, int nbSeq,
-                         const ZSTD_longOffset_e isLongOffset,
-                         const int frame)
+                         const ZSTD_longOffset_e isLongOffset)
 {
     const BYTE* ip = (const BYTE*)seqStart;
     const BYTE* const iend = ip + seqSize;
     BYTE* const ostart = (BYTE*)dst;
-    BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize;
+    BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize);
     BYTE* op = ostart;
     const BYTE* litPtr = dctx->litPtr;
     const BYTE* litBufferEnd = dctx->litBufferEnd;
     const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
     const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
     const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
-    (void)frame;
 
     /* Regen sequences */
     if (nbSeq) {
@@ -1706,20 +1771,17 @@ ZSTD_decompressSequencesLong_body(
         ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
 
         /* prepare in advance */
-        for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
-            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
+        for (seqNb=0; seqNb<seqAdvance; seqNb++) {
+            seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1);
             prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
             sequences[seqNb] = sequence;
         }
-        RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
 
         /* decompress without stomping litBuffer */
-        for (; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb < nbSeq); seqNb++) {
-            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset);
-            size_t oneSeqSize;
+        for (; seqNb < nbSeq; seqNb++) {
+            seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset, seqNb == nbSeq-1);
 
-            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd)
-            {
+            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) {
                 /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */
                 const size_t leftoverLit = dctx->litBufferEnd - litPtr;
                 if (leftoverLit)
@@ -1732,26 +1794,26 @@ ZSTD_decompressSequencesLong_body(
                 litPtr = dctx->litExtraBuffer;
                 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
                 dctx->litBufferLocation = ZSTD_not_in_dst;
-                oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
-                assert(!ZSTD_isError(oneSeqSize));
-                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+                    assert(!ZSTD_isError(oneSeqSize));
+                    ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
 #endif
-                if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+                    if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
 
-                prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
-                sequences[seqNb & STORED_SEQS_MASK] = sequence;
-                op += oneSeqSize;
-            }
+                    prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd);
+                    sequences[seqNb & STORED_SEQS_MASK] = sequence;
+                    op += oneSeqSize;
+            }   }
             else
             {
                 /* lit buffer is either wholly contained in first or second split, or not split at all*/
-                oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
+                size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ?
                     ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) :
                     ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                 assert(!ZSTD_isError(oneSeqSize));
-                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+                ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
 #endif
                 if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
 
@@ -1760,17 +1822,15 @@ ZSTD_decompressSequencesLong_body(
                 op += oneSeqSize;
             }
         }
-        RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");
+        RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, "");
 
         /* finish queue */
         seqNb -= seqAdvance;
         for ( ; seqNb<nbSeq ; seqNb++) {
             seq_t *sequence = &(sequences[seqNb&STORED_SEQS_MASK]);
-            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd)
-            {
+            if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) {
                 const size_t leftoverLit = dctx->litBufferEnd - litPtr;
-                if (leftoverLit)
-                {
+                if (leftoverLit) {
                     RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer");
                     ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit);
                     sequence->litLength -= leftoverLit;
@@ -1779,11 +1839,10 @@ ZSTD_decompressSequencesLong_body(
                 litPtr = dctx->litExtraBuffer;
                 litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE;
                 dctx->litBufferLocation = ZSTD_not_in_dst;
-                {
-                    size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
+                {   size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                     assert(!ZSTD_isError(oneSeqSize));
-                    if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+                    ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
 #endif
                     if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
                     op += oneSeqSize;
@@ -1796,7 +1855,7 @@ ZSTD_decompressSequencesLong_body(
                     ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd);
 #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
                 assert(!ZSTD_isError(oneSeqSize));
-                if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+                ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
 #endif
                 if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
                 op += oneSeqSize;
@@ -1808,8 +1867,7 @@ ZSTD_decompressSequencesLong_body(
     }
 
     /* last literal segment */
-    if (dctx->litBufferLocation == ZSTD_split)  /* first deplete literal buffer in dst, then copy litExtraBuffer */
-    {
+    if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */
         size_t const lastLLSize = litBufferEnd - litPtr;
         RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, "");
         if (op != NULL) {
@@ -1827,17 +1885,16 @@ ZSTD_decompressSequencesLong_body(
         }
     }
 
-    return op-ostart;
+    return (size_t)(op - ostart);
 }
 
 static size_t
 ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
                                  void* dst, size_t maxDstSize,
                            const void* seqStart, size_t seqSize, int nbSeq,
-                           const ZSTD_longOffset_e isLongOffset,
-                           const int frame)
+                           const ZSTD_longOffset_e isLongOffset)
 {
-    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
 }
 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
 
@@ -1851,20 +1908,18 @@ DONT_VECTORIZE
 ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
                                  void* dst, size_t maxDstSize,
                            const void* seqStart, size_t seqSize, int nbSeq,
-                           const ZSTD_longOffset_e isLongOffset,
-                           const int frame)
+                           const ZSTD_longOffset_e isLongOffset)
 {
-    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+    return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
 }
 static BMI2_TARGET_ATTRIBUTE size_t
 DONT_VECTORIZE
 ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx,
                                  void* dst, size_t maxDstSize,
                            const void* seqStart, size_t seqSize, int nbSeq,
-                           const ZSTD_longOffset_e isLongOffset,
-                           const int frame)
+                           const ZSTD_longOffset_e isLongOffset)
 {
-    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+    return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
 }
 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
 
@@ -1873,10 +1928,9 @@ static BMI2_TARGET_ATTRIBUTE size_t
 ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
                                  void* dst, size_t maxDstSize,
                            const void* seqStart, size_t seqSize, int nbSeq,
-                           const ZSTD_longOffset_e isLongOffset,
-                           const int frame)
+                           const ZSTD_longOffset_e isLongOffset)
 {
-    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+    return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
 }
 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
 
@@ -1886,37 +1940,34 @@ typedef size_t (*ZSTD_decompressSequences_t)(
                             ZSTD_DCtx* dctx,
                             void* dst, size_t maxDstSize,
                             const void* seqStart, size_t seqSize, int nbSeq,
-                            const ZSTD_longOffset_e isLongOffset,
-                            const int frame);
+                            const ZSTD_longOffset_e isLongOffset);
 
 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
 static size_t
 ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
                    const void* seqStart, size_t seqSize, int nbSeq,
-                   const ZSTD_longOffset_e isLongOffset,
-                   const int frame)
+                   const ZSTD_longOffset_e isLongOffset)
 {
     DEBUGLOG(5, "ZSTD_decompressSequences");
 #if DYNAMIC_BMI2
     if (ZSTD_DCtx_get_bmi2(dctx)) {
-        return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+        return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
     }
 #endif
-    return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+    return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
 }
 static size_t
 ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
                                  const void* seqStart, size_t seqSize, int nbSeq,
-                                 const ZSTD_longOffset_e isLongOffset,
-                                 const int frame)
+                                 const ZSTD_longOffset_e isLongOffset)
 {
     DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer");
 #if DYNAMIC_BMI2
     if (ZSTD_DCtx_get_bmi2(dctx)) {
-        return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+        return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
     }
 #endif
-    return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+    return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
 }
 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
 
@@ -1931,69 +1982,114 @@ static size_t
 ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
                              void* dst, size_t maxDstSize,
                              const void* seqStart, size_t seqSize, int nbSeq,
-                             const ZSTD_longOffset_e isLongOffset,
-                             const int frame)
+                             const ZSTD_longOffset_e isLongOffset)
 {
     DEBUGLOG(5, "ZSTD_decompressSequencesLong");
 #if DYNAMIC_BMI2
     if (ZSTD_DCtx_get_bmi2(dctx)) {
-        return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+        return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
     }
 #endif
-  return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+  return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset);
 }
 #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
 
 
+/*
+ * @returns The total size of the history referenceable by zstd, including
+ * both the prefix and the extDict. At @p op any offset larger than this
+ * is invalid.
+ */
+static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart)
+{
+    return (size_t)(op - virtualStart);
+}
+
+typedef struct {
+    unsigned longOffsetShare;
+    unsigned maxNbAdditionalBits;
+} ZSTD_OffsetInfo;
 
-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
-    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-/* ZSTD_getLongOffsetsShare() :
+/* ZSTD_getOffsetInfo() :
  * condition : offTable must be valid
  * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
- *           compared to maximum possible of (1<<OffFSELog) */
-static unsigned
-ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
+ *           compared to maximum possible of (1<<OffFSELog),
+ *           as well as the maximum number additional bits required.
+ */
+static ZSTD_OffsetInfo
+ZSTD_getOffsetInfo(const ZSTD_seqSymbol* offTable, int nbSeq)
 {
-    const void* ptr = offTable;
-    U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
-    const ZSTD_seqSymbol* table = offTable + 1;
-    U32 const max = 1 << tableLog;
-    U32 u, total = 0;
-    DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
-
-    assert(max <= (1 << OffFSELog));  /* max not too large */
-    for (u=0; u<max; u++) {
-        if (table[u].nbAdditionalBits > 22) total += 1;
+    ZSTD_OffsetInfo info = {0, 0};
+    /* If nbSeq == 0, then the offTable is uninitialized, but we have
+     * no sequences, so both values should be 0.
+     */
+    if (nbSeq != 0) {
+        const void* ptr = offTable;
+        U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
+        const ZSTD_seqSymbol* table = offTable + 1;
+        U32 const max = 1 << tableLog;
+        U32 u;
+        DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
+
+        assert(max <= (1 << OffFSELog));  /* max not too large */
+        for (u=0; u<max; u++) {
+            info.maxNbAdditionalBits = MAX(info.maxNbAdditionalBits, table[u].nbAdditionalBits);
+            if (table[u].nbAdditionalBits > 22) info.longOffsetShare += 1;
+        }
+
+        assert(tableLog <= OffFSELog);
+        info.longOffsetShare <<= (OffFSELog - tableLog);  /* scale to OffFSELog */
     }
 
-    assert(tableLog <= OffFSELog);
-    total <<= (OffFSELog - tableLog);  /* scale to OffFSELog */
+    return info;
+}
 
-    return total;
+/*
+ * @returns The maximum offset we can decode in one read of our bitstream, without
+ * reloading more bits in the middle of the offset bits read. Any offsets larger
+ * than this must use the long offset decoder.
+ */
+static size_t ZSTD_maxShortOffset(void)
+{
+    if (MEM_64bits()) {
+        /* We can decode any offset without reloading bits.
+         * This might change if the max window size grows.
+         */
+        ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+        return (size_t)-1;
+    } else {
+        /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1.
+         * This offBase would require STREAM_ACCUMULATOR_MIN extra bits.
+         * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset.
+         */
+        size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1;
+        size_t const maxOffset = maxOffbase - ZSTD_REP_NUM;
+        assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN);
+        return maxOffset;
+    }
 }
-#endif
 
 size_t
 ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
                               void* dst, size_t dstCapacity,
-                        const void* src, size_t srcSize, const int frame, const streaming_operation streaming)
+                        const void* src, size_t srcSize, const streaming_operation streaming)
 {   /* blockType == blockCompressed */
     const BYTE* ip = (const BYTE*)src;
-    /* isLongOffset must be true if there are long offsets.
-     * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
-     * We don't expect that to be the case in 64-bit mode.
-     * In block mode, window size is not known, so we have to be conservative.
-     * (note: but it could be evaluated from current-lowLimit)
-     */
-    ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
-    DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
-
-    RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
+    DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize);
+
+    /* Note : the wording of the specification
+     * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx).
+     * This generally does not happen, as it makes little sense,
+     * since an uncompressed block would feature same size and have no decompression cost.
+     * Also, note that decoder from reference libzstd before < v1.5.4
+     * would consider this edge case as an error.
+     * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx)
+     * for broader compatibility with the deployed ecosystem of zstd decoders */
+    RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, "");
 
     /* Decode literals section */
     {   size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming);
-        DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
+        DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize);
         if (ZSTD_isError(litCSize)) return litCSize;
         ip += litCSize;
         srcSize -= litCSize;
@@ -2001,6 +2097,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
 
     /* Build Decoding Tables */
     {
+        /* Compute the maximum block size, which must also work when !frame and fParams are unset.
+         * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t.
+         */
+        size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx));
+        size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart);
+        /* isLongOffset must be true if there are long offsets.
+         * Offsets are long if they are larger than ZSTD_maxShortOffset().
+         * We don't expect that to be the case in 64-bit mode.
+         *
+         * We check here to see if our history is large enough to allow long offsets.
+         * If it isn't, then we can't possible have (valid) long offsets. If the offset
+         * is invalid, then it is okay to read it incorrectly.
+         *
+         * If isLongOffsets is true, then we will later check our decoding table to see
+         * if it is even possible to generate long offsets.
+         */
+        ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset()));
         /* These macros control at build-time which decompressor implementation
          * we use. If neither is defined, we do some inspection and dispatch at
          * runtime.
@@ -2008,6 +2121,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
     !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
         int usePrefetchDecoder = dctx->ddictIsCold;
+#else
+        /* Set to 1 to avoid computing offset info if we don't need to.
+         * Otherwise this value is ignored.
+         */
+        int usePrefetchDecoder = 1;
 #endif
         int nbSeq;
         size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
@@ -2015,40 +2133,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
         ip += seqHSize;
         srcSize -= seqHSize;
 
-        RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+        RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+        RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall,
+                "invalid dst");
 
-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
-    !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-        if ( !usePrefetchDecoder
-          && (!frame || (dctx->fParams.windowSize > (1<<24)))
-          && (nbSeq>ADVANCED_SEQS) ) {  /* could probably use a larger nbSeq limit */
-            U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
-            U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
-            usePrefetchDecoder = (shareLongOffsets >= minShare);
+        /* If we could potentially have long offsets, or we might want to use the prefetch decoder,
+         * compute information about the share of long offsets, and the maximum nbAdditionalBits.
+         * NOTE: could probably use a larger nbSeq limit
+         */
+        if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) {
+            ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq);
+            if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) {
+                /* If isLongOffset, but the maximum number of additional bits that we see in our table is small
+                 * enough, then we know it is impossible to have too long an offset in this block, so we can
+                 * use the regular offset decoder.
+                 */
+                isLongOffset = ZSTD_lo_isRegularOffset;
+            }
+            if (!usePrefetchDecoder) {
+                U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+                usePrefetchDecoder = (info.longOffsetShare >= minShare);
+            }
         }
-#endif
 
         dctx->ddictIsCold = 0;
 
 #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
     !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
-        if (usePrefetchDecoder)
+        if (usePrefetchDecoder) {
+#else
+        (void)usePrefetchDecoder;
+        {
 #endif
 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
-            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
 #endif
+        }
 
 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
         /* else */
         if (dctx->litBufferLocation == ZSTD_split)
-            return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+            return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
         else
-            return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+            return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
 #endif
     }
 }
 
 
+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
 void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
 {
     if (dst != dctx->previousDstEnd && dstSize > 0) {   /* not contiguous */
@@ -2060,13 +2193,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize)
 }
 
 
-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
-                            void* dst, size_t dstCapacity,
-                      const void* src, size_t srcSize)
+size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize)
 {
     size_t dSize;
+    dctx->isFrameDecompression = 0;
     ZSTD_checkContinuity(dctx, dst, dstCapacity);
-    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming);
+    dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming);
+    FORWARD_IF_ERROR(dSize, "");
     dctx->previousDstEnd = (char*)dst + dSize;
     return dSize;
 }
+
+
+/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */
+size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize)
+{
+    return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize);
+}
diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h
index 3d2d57a5d25a..becffbd89364 100644
--- a/lib/zstd/decompress/zstd_decompress_block.h
+++ b/lib/zstd/decompress/zstd_decompress_block.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -47,7 +48,7 @@ typedef enum {
  */
 size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
                                void* dst, size_t dstCapacity,
-                         const void* src, size_t srcSize, const int frame, const streaming_operation streaming);
+                         const void* src, size_t srcSize, const streaming_operation streaming);
 
 /* ZSTD_buildFSETable() :
  * generate FSE decoding table for one symbol (ll, ml or off)
@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
                    unsigned tableLog, void* wksp, size_t wkspSize,
                    int bmi2);
 
+/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */
+size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity,
+                      const void* src, size_t srcSize);
+
 
 #endif /* ZSTD_DEC_BLOCK_H */
diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h
index 98102edb6a83..0f02526be774 100644
--- a/lib/zstd/decompress/zstd_decompress_internal.h
+++ b/lib/zstd/decompress/zstd_decompress_internal.h
@@ -1,5 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Yann Collet, Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = {
 
 #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64))
 #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32))
+#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12
 
 typedef struct {
     ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)];    /* Note : Space reserved for FSE Tables */
     ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)];   /* is also used as temporary workspace while building hufTable during DDict creation */
     ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)];    /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
-    HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)];  /* can accommodate HUF_decompress4X */
+    HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)];  /* can accommodate HUF_decompress4X */
     U32 rep[ZSTD_REP_NUM];
     U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32];
 } ZSTD_entropyDTables_t;
@@ -152,6 +154,7 @@ struct ZSTD_DCtx_s
     size_t litSize;
     size_t rleSize;
     size_t staticSize;
+    int isFrameDecompression;
 #if DYNAMIC_BMI2 != 0
     int bmi2;                     /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
 #endif
@@ -164,6 +167,8 @@ struct ZSTD_DCtx_s
     ZSTD_dictUses_e dictUses;
     ZSTD_DDictHashSet* ddictSet;                    /* Hash set for multiple ddicts */
     ZSTD_refMultipleDDicts_e refMultipleDDicts;     /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */
+    int disableHufAsm;
+    int maxBlockSizeParam;
 
     /* streaming */
     ZSTD_dStreamStage streamStage;
diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h
index a06ca187aab5..8a47eb2a4514 100644
--- a/lib/zstd/decompress_sources.h
+++ b/lib/zstd/decompress_sources.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c
index 22686e367e6f..466828e35752 100644
--- a/lib/zstd/zstd_common_module.c
+++ b/lib/zstd/zstd_common_module.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp);
 EXPORT_SYMBOL_GPL(ZSTD_isError);
 EXPORT_SYMBOL_GPL(ZSTD_getErrorName);
 EXPORT_SYMBOL_GPL(ZSTD_getErrorCode);
-EXPORT_SYMBOL_GPL(ZSTD_customMalloc);
-EXPORT_SYMBOL_GPL(ZSTD_customCalloc);
-EXPORT_SYMBOL_GPL(ZSTD_customFree);
 
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_DESCRIPTION("Zstd Common");
diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c
index 04e1b5c01d9b..8ecf43226af2 100644
--- a/lib/zstd/zstd_compress_module.c
+++ b/lib/zstd/zstd_compress_module.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c
index f4ed952ed485..7d31518e9d5a 100644
--- a/lib/zstd/zstd_decompress_module.c
+++ b/lib/zstd/zstd_decompress_module.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
 /*
- * Copyright (c) Facebook, Inc.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  * All rights reserved.
  *
  * This source code is licensed under both the BSD-style license (found in the
@@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream);
 
 size_t zstd_reset_dstream(zstd_dstream *dstream)
 {
-	return ZSTD_resetDStream(dstream);
+	return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only);
 }
 EXPORT_SYMBOL(zstd_reset_dstream);
 
-- 
2.46.0