diff --git a/config b/config index 6c60e62..247adc5 100644 --- a/config +++ b/config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 6.5.0-rc7 Kernel Configuration +# Linux/x86 6.5.0 Kernel Configuration # CONFIG_CC_VERSION_TEXT="gcc (GCC) 13.2.1 20230730" CONFIG_CC_IS_GCC=y @@ -2894,7 +2894,6 @@ CONFIG_SCSI_DH_RDAC=m CONFIG_SCSI_DH_HP_SW=m CONFIG_SCSI_DH_EMC=m CONFIG_SCSI_DH_ALUA=m -CONFIG_VHBA=m # end of SCSI device support CONFIG_ATA=y diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch index 2903879..9565648 100644 --- a/patches/0001-cachy-all.patch +++ b/patches/0001-cachy-all.patch @@ -1,13 +1,2145 @@ -From a7ef8b1848b3d53522882d36ef91ba3a6fcc619c Mon Sep 17 00:00:00 2001 +From de38719bf3e0937c83054c911c5cf102eae632dd Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 20 Aug 2023 15:52:45 +0200 -Subject: [PATCH 1/6] amd-pref-core +Date: Mon, 28 Aug 2023 14:01:05 +0200 +Subject: [PATCH 1/7] amd-hdr + +Signed-off-by: Peter Jung +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h | 71 ++ + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 34 +- + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 100 +++ + .../amd/display/amdgpu_dm/amdgpu_dm_color.c | 805 ++++++++++++++++-- + .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 72 ++ + .../amd/display/amdgpu_dm/amdgpu_dm_plane.c | 224 ++++- + .../amd/display/dc/dcn10/dcn10_cm_common.c | 95 ++- + .../drm/amd/display/dc/dcn30/dcn30_hwseq.c | 37 + + .../drm/amd/display/dc/dcn30/dcn30_hwseq.h | 3 + + .../drm/amd/display/dc/dcn301/dcn301_init.c | 2 +- + .../gpu/drm/amd/display/include/fixed31_32.h | 12 + + drivers/gpu/drm/arm/malidp_crtc.c | 2 +- + drivers/gpu/drm/drm_atomic.c | 1 + + drivers/gpu/drm/drm_atomic_state_helper.c | 1 + + drivers/gpu/drm/drm_atomic_uapi.c | 43 +- + drivers/gpu/drm/drm_property.c | 49 ++ + include/drm/drm_mode_object.h | 2 +- + include/drm/drm_plane.h | 7 + + include/drm/drm_property.h | 6 + + include/uapi/drm/drm_mode.h | 8 + + 20 files changed, 1446 insertions(+), 128 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h +index 32fe05c810c6..84bf501b02f4 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h +@@ -343,6 +343,77 @@ struct amdgpu_mode_info { + int disp_priority; + const struct amdgpu_display_funcs *funcs; + const enum drm_plane_type *plane_type; ++ ++ /* Driver-private color mgmt props */ ++ ++ /* @plane_degamma_lut_property: Plane property to set a degamma LUT to ++ * convert input space before blending. ++ */ ++ struct drm_property *plane_degamma_lut_property; ++ /* @plane_degamma_lut_size_property: Plane property to define the max ++ * size of degamma LUT as supported by the driver (read-only). ++ */ ++ struct drm_property *plane_degamma_lut_size_property; ++ /** ++ * @plane_degamma_tf_property: Plane pre-defined transfer function to ++ * to go from scanout/encoded values to linear values. ++ */ ++ struct drm_property *plane_degamma_tf_property; ++ /** ++ * @plane_hdr_mult_property: ++ */ ++ struct drm_property *plane_hdr_mult_property; ++ ++ struct drm_property *plane_ctm_property; ++ /** ++ * @shaper_lut_property: Plane property to set pre-blending shaper LUT ++ * that converts color content before 3D LUT. ++ */ ++ struct drm_property *plane_shaper_lut_property; ++ /** ++ * @shaper_lut_size_property: Plane property for the size of ++ * pre-blending shaper LUT as supported by the driver (read-only). 
++ */ ++ struct drm_property *plane_shaper_lut_size_property; ++ /** ++ * @plane_shaper_tf_property: Plane property to set a predefined ++ * transfer function for pre-blending shaper (before applying 3D LUT) ++ * with or without LUT. ++ */ ++ struct drm_property *plane_shaper_tf_property; ++ /** ++ * @plane_lut3d_property: Plane property for gamma correction using a ++ * 3D LUT (pre-blending). ++ */ ++ struct drm_property *plane_lut3d_property; ++ /** ++ * @plane_degamma_lut_size_property: Plane property to define the max ++ * size of 3D LUT as supported by the driver (read-only). ++ */ ++ struct drm_property *plane_lut3d_size_property; ++ /** ++ * @plane_blend_lut_property: Plane property for output gamma before ++ * blending. Userspace set a blend LUT to convert colors after 3D LUT ++ * conversion. It works as a post-3D LUT 1D LUT, with shaper LUT, they ++ * are sandwiching 3D LUT with two 1D LUT. ++ */ ++ struct drm_property *plane_blend_lut_property; ++ /** ++ * @plane_blend_lut_size_property: Plane property to define the max ++ * size of blend LUT as supported by the driver (read-only). ++ */ ++ struct drm_property *plane_blend_lut_size_property; ++ /** ++ * @plane_blend_tf_property: Plane property to set a predefined ++ * transfer function for pre-blending blend (before applying 3D LUT) ++ * with or without LUT. ++ */ ++ struct drm_property *plane_blend_tf_property; ++ /* @regamma_tf_property: Transfer function for CRTC regamma ++ * (post-blending). Possible values are defined by `enum ++ * amdgpu_transfer_function`. ++ */ ++ struct drm_property *regamma_tf_property; + }; + + #define AMDGPU_MAX_BL_LEVEL 0xFF +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +index e5554a36e8c8..43ef0e5f97ae 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -3943,6 +3943,11 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) + return r; + } + ++#ifdef AMD_PRIVATE_COLOR ++ if (amdgpu_dm_create_color_properties(adev)) ++ return -ENOMEM; ++#endif ++ + r = amdgpu_dm_audio_init(adev); + if (r) { + dc_release_state(state->context); +@@ -4992,7 +4997,9 @@ static int fill_dc_plane_attributes(struct amdgpu_device *adev, + * Always set input transfer function, since plane state is refreshed + * every time. 
+ */ +- ret = amdgpu_dm_update_plane_color_mgmt(dm_crtc_state, dc_plane_state); ++ ret = amdgpu_dm_update_plane_color_mgmt(dm_crtc_state, ++ plane_state, ++ dc_plane_state); + if (ret) + return ret; + +@@ -8007,6 +8014,10 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, + bundle->surface_updates[planes_count].gamma = dc_plane->gamma_correction; + bundle->surface_updates[planes_count].in_transfer_func = dc_plane->in_transfer_func; + bundle->surface_updates[planes_count].gamut_remap_matrix = &dc_plane->gamut_remap_matrix; ++ bundle->surface_updates[planes_count].hdr_mult = dc_plane->hdr_mult; ++ bundle->surface_updates[planes_count].func_shaper = dc_plane->in_shaper_func; ++ bundle->surface_updates[planes_count].lut3d_func = dc_plane->lut3d_func; ++ bundle->surface_updates[planes_count].blend_tf = dc_plane->blend_tf; + } + + amdgpu_dm_plane_fill_dc_scaling_info(dm->adev, new_plane_state, +@@ -8215,6 +8226,10 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, + &acrtc_state->stream->csc_color_matrix; + bundle->stream_update.out_transfer_func = + acrtc_state->stream->out_transfer_func; ++ bundle->stream_update.lut3d_func = ++ (struct dc_3dlut *) acrtc_state->stream->lut3d_func; ++ bundle->stream_update.func_shaper = ++ (struct dc_transfer_func *) acrtc_state->stream->func_shaper; + } + + acrtc_state->stream->abm_level = acrtc_state->abm_level; +@@ -9405,6 +9420,7 @@ static int dm_update_crtc_state(struct amdgpu_display_manager *dm, + * when a modeset is needed, to ensure it gets reprogrammed. + */ + if (dm_new_crtc_state->base.color_mgmt_changed || ++ dm_old_crtc_state->regamma_tf != dm_new_crtc_state->regamma_tf || + drm_atomic_crtc_needs_modeset(new_crtc_state)) { + ret = amdgpu_dm_update_crtc_color_mgmt(dm_new_crtc_state); + if (ret) +@@ -9472,6 +9488,10 @@ static bool should_reset_plane(struct drm_atomic_state *state, + */ + for_each_oldnew_plane_in_state(state, other, old_other_state, new_other_state, i) { + struct amdgpu_framebuffer *old_afb, *new_afb; ++ struct dm_plane_state *dm_new_other_state, *dm_old_other_state; ++ ++ dm_new_other_state = to_dm_plane_state(new_other_state); ++ dm_old_other_state = to_dm_plane_state(old_other_state); + + if (other->type == DRM_PLANE_TYPE_CURSOR) + continue; +@@ -9508,6 +9528,18 @@ static bool should_reset_plane(struct drm_atomic_state *state, + old_other_state->color_encoding != new_other_state->color_encoding) + return true; + ++ /* HDR/Transfer Function changes. */ ++ if (dm_old_other_state->degamma_tf != dm_new_other_state->degamma_tf || ++ dm_old_other_state->degamma_lut != dm_new_other_state->degamma_lut || ++ dm_old_other_state->hdr_mult != dm_new_other_state->hdr_mult || ++ dm_old_other_state->ctm != dm_new_other_state->ctm || ++ dm_old_other_state->shaper_lut != dm_new_other_state->shaper_lut || ++ dm_old_other_state->shaper_tf != dm_new_other_state->shaper_tf || ++ dm_old_other_state->lut3d != dm_new_other_state->lut3d || ++ dm_old_other_state->blend_lut != dm_new_other_state->blend_lut || ++ dm_old_other_state->blend_tf != dm_new_other_state->blend_tf) ++ return true; ++ + /* Framebuffer checks fall at the end. 
*/ + if (!old_other_state->fb || !new_other_state->fb) + continue; +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +index 9fb5bb3a75a7..f92bbd7ed867 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +@@ -51,6 +51,8 @@ + + #define AMDGPU_DMUB_NOTIFICATION_MAX 5 + ++#define AMDGPU_HDR_MULT_DEFAULT (0x100000000LL) ++ + /* + #include "include/amdgpu_dal_power_if.h" + #include "amdgpu_dm_irq.h" +@@ -702,9 +704,91 @@ static inline void amdgpu_dm_set_mst_status(uint8_t *status, + + extern const struct amdgpu_ip_block_version dm_ip_block; + ++enum amdgpu_transfer_function { ++ AMDGPU_TRANSFER_FUNCTION_DEFAULT, ++ AMDGPU_TRANSFER_FUNCTION_SRGB_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_BT709_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_PQ_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_LINEAR, ++ AMDGPU_TRANSFER_FUNCTION_UNITY, ++ AMDGPU_TRANSFER_FUNCTION_GAMMA22_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_GAMMA24_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_GAMMA26_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_SRGB_INV_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_BT709_INV_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_PQ_INV_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_GAMMA22_INV_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_GAMMA24_INV_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_GAMMA26_INV_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_COUNT ++}; ++ + struct dm_plane_state { + struct drm_plane_state base; + struct dc_plane_state *dc_state; ++ ++ /* Plane color mgmt */ ++ /** ++ * @degamma_lut: ++ * ++ * 1D LUT for mapping framebuffer/plane pixel data before sampling or ++ * blending operations. It's usually applied to linearize input space. ++ * The blob (if not NULL) is an array of &struct drm_color_lut. ++ */ ++ struct drm_property_blob *degamma_lut; ++ /** ++ * @degamma_tf: ++ * ++ * Predefined transfer function to tell DC driver the input space to ++ * linearize. ++ */ ++ enum amdgpu_transfer_function degamma_tf; ++ /** ++ * @hdr_mult: ++ * ++ * Multiplier to 'gain' the plane. When PQ is decoded using the fixed ++ * func transfer function to the internal FP16 fb, 1.0 -> 80 nits (on ++ * AMD at least). When sRGB is decoded, 1.0 -> 1.0, obviously. ++ * Therefore, 1.0 multiplier = 80 nits for SDR content. So if you ++ * want, 203 nits for SDR content, pass in (203.0 / 80.0). Format is ++ * S31.32 sign-magnitude. ++ */ ++ __u64 hdr_mult; ++ /** ++ * @ctm: ++ * ++ * Color transformation matrix. See drm_crtc_enable_color_mgmt(). The ++ * blob (if not NULL) is a &struct drm_color_ctm. ++ */ ++ struct drm_property_blob *ctm; ++ /** ++ * @shaper_lut: shaper lookup table blob. The blob (if not NULL) is an ++ * array of &struct drm_color_lut. ++ */ ++ struct drm_property_blob *shaper_lut; ++ /** ++ * @shaper_tf: ++ * ++ * Predefined transfer function to delinearize color space. ++ */ ++ enum amdgpu_transfer_function shaper_tf; ++ /** ++ * @lut3d: 3D lookup table blob. The blob (if not NULL) is an array of ++ * &struct drm_color_lut. ++ */ ++ struct drm_property_blob *lut3d; ++ /** ++ * @blend_lut: blend lut lookup table blob. The blob (if not NULL) is an ++ * array of &struct drm_color_lut. ++ */ ++ struct drm_property_blob *blend_lut; ++ /** ++ * @blend_tf: ++ * ++ * Pre-defined transfer function for converting plane pixel data before ++ * applying blend LUT. 
++ */ ++ enum amdgpu_transfer_function blend_tf; + }; + + struct dm_crtc_state { +@@ -729,6 +813,14 @@ struct dm_crtc_state { + struct dc_info_packet vrr_infopacket; + + int abm_level; ++ ++ /** ++ * @regamma_tf: ++ * ++ * Pre-defined transfer function for converting internal FB -> wire ++ * encoding. ++ */ ++ enum amdgpu_transfer_function regamma_tf; + }; + + #define to_dm_crtc_state(x) container_of(x, struct dm_crtc_state, base) +@@ -790,14 +882,22 @@ void amdgpu_dm_update_freesync_caps(struct drm_connector *connector, + + void amdgpu_dm_trigger_timing_sync(struct drm_device *dev); + ++/* 3D LUT max size is 17x17x17 */ ++#define MAX_COLOR_3DLUT_ENTRIES 4913 ++#define MAX_COLOR_3DLUT_BITDEPTH 12 ++int amdgpu_dm_verify_lut3d_size(struct amdgpu_device *adev, ++ struct drm_plane_state *plane_state); ++/* 1D LUT size */ + #define MAX_COLOR_LUT_ENTRIES 4096 + /* Legacy gamm LUT users such as X doesn't like large LUT sizes */ + #define MAX_COLOR_LEGACY_LUT_ENTRIES 256 + + void amdgpu_dm_init_color_mod(void); ++int amdgpu_dm_create_color_properties(struct amdgpu_device *adev); + int amdgpu_dm_verify_lut_sizes(const struct drm_crtc_state *crtc_state); + int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc); + int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc, ++ struct drm_plane_state *plane_state, + struct dc_plane_state *dc_plane_state); + + void amdgpu_dm_update_connector_after_detect( +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +index a4cb23d059bd..0a51af44efd5 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +@@ -72,6 +72,7 @@ + */ + + #define MAX_DRM_LUT_VALUE 0xFFFF ++#define SDR_WHITE_LEVEL_INIT_VALUE 80 + + /** + * amdgpu_dm_init_color_mod - Initialize the color module. +@@ -84,6 +85,213 @@ void amdgpu_dm_init_color_mod(void) + setup_x_points_distribution(); + } + ++#ifdef AMD_PRIVATE_COLOR ++/* Pre-defined Transfer Functions (TF) ++ * ++ * AMD driver supports pre-defined mathematical functions for transferring ++ * between encoded values and optical/linear space. Depending on HW color caps, ++ * ROMs and curves built by the AMD color module support these transforms. ++ * ++ * The driver-specific color implementation exposes properties for pre-blending ++ * degamma TF, shaper TF (before 3D LUT), and blend(dpp.ogam) TF and ++ * post-blending regamma (mpc.ogam) TF. However, only pre-blending degamma ++ * supports ROM curves. AMD color module uses pre-defined coefficients to build ++ * curves for the other blocks. What can be done by each color block is ++ * described by struct dpp_color_capsand struct mpc_color_caps. ++ * ++ * AMD driver-specific color API exposes the following pre-defined transfer ++ * functions: ++ * ++ * - Linear/Unity: linear/identity relationship between pixel value and ++ * luminance value; ++ * - Gamma 2.2, Gamma 2.4, Gamma 2.6: pure gamma functions; ++ * - sRGB: 2.4 gamma with small initial linear section as standardized by IEC ++ * 61966-2-1:1999; ++ * - BT.709 (BT.1886): 2.4 gamma with differences in the dark end of the scale. ++ * Used in HD-TV and standardized by ITU-R BT.1886; ++ * - PQ (Perceptual Quantizer): used for HDR display, allows luminance range ++ * capability of 0 to 10,000 nits; standardized by SMPTE ST 2084. 
++ * ++ * In the driver-specific API, color block names attached to TF properties ++ * suggest the intention regarding non-linear encoding pixel's luminance ++ * values. As some newer encodings don't use gamma curve, we make encoding and ++ * decoding explicit by defining an enum list of transfer functions supported ++ * in terms of EOTF and inverse EOTF, where: ++ * ++ * - EOTF (electro-optical transfer function): is the transfer function to go ++ * from the encoded value to an optical (linear) value. De-gamma functions ++ * traditionally do this. ++ * - Inverse EOTF (simply the inverse of the EOTF): is usually intended to go ++ * from an optical/linear space (which might have been used for blending) ++ * back to the encoded values. Gamma functions traditionally do this. ++ */ ++static const char * const ++amdgpu_transfer_function_names[] = { ++ [AMDGPU_TRANSFER_FUNCTION_DEFAULT] = "Default", ++ [AMDGPU_TRANSFER_FUNCTION_LINEAR] = "Linear", ++ [AMDGPU_TRANSFER_FUNCTION_UNITY] = "Unity", ++ [AMDGPU_TRANSFER_FUNCTION_SRGB_EOTF] = "sRGB EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_BT709_EOTF] = "BT.709 EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_PQ_EOTF] = "PQ EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_GAMMA22_EOTF] = "Gamma 2.2 EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_GAMMA24_EOTF] = "Gamma 2.4 EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_GAMMA26_EOTF] = "Gamma 2.6 EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_SRGB_INV_EOTF] = "sRGB inv_EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_BT709_INV_EOTF] = "BT.709 inv_EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_PQ_INV_EOTF] = "PQ inv_EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_GAMMA22_INV_EOTF] = "Gamma 2.2 inv_EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_GAMMA24_INV_EOTF] = "Gamma 2.4 inv_EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_GAMMA26_INV_EOTF] = "Gamma 2.6 inv_EOTF", ++}; ++ ++static const u32 amdgpu_eotf = ++ BIT(AMDGPU_TRANSFER_FUNCTION_SRGB_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_BT709_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_PQ_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA22_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA24_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA26_EOTF); ++ ++static const u32 amdgpu_inv_eotf = ++ BIT(AMDGPU_TRANSFER_FUNCTION_SRGB_INV_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_BT709_INV_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_PQ_INV_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA22_INV_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA24_INV_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA26_INV_EOTF); ++ ++static struct drm_property * ++amdgpu_create_tf_property(struct drm_device *dev, ++ const char *name, ++ u32 supported_tf) ++{ ++ u32 transfer_functions = supported_tf | ++ BIT(AMDGPU_TRANSFER_FUNCTION_DEFAULT) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_LINEAR) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_UNITY); ++ struct drm_prop_enum_list enum_list[AMDGPU_TRANSFER_FUNCTION_COUNT]; ++ int i, len; ++ ++ len = 0; ++ for (i = 0; i < AMDGPU_TRANSFER_FUNCTION_COUNT; i++) { ++ if ((transfer_functions & BIT(i)) == 0) ++ continue; ++ ++ enum_list[len].type = i; ++ enum_list[len].name = amdgpu_transfer_function_names[i]; ++ len++; ++ } ++ ++ return drm_property_create_enum(dev, DRM_MODE_PROP_ENUM, ++ name, enum_list, len); ++} ++ ++int ++amdgpu_dm_create_color_properties(struct amdgpu_device *adev) ++{ ++ struct drm_property *prop; ++ ++ prop = drm_property_create(adev_to_drm(adev), ++ DRM_MODE_PROP_BLOB, ++ "AMD_PLANE_DEGAMMA_LUT", 0); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_degamma_lut_property = prop; ++ ++ prop = drm_property_create_range(adev_to_drm(adev), ++ DRM_MODE_PROP_IMMUTABLE, ++ "AMD_PLANE_DEGAMMA_LUT_SIZE", 0, 
UINT_MAX); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_degamma_lut_size_property = prop; ++ ++ prop = amdgpu_create_tf_property(adev_to_drm(adev), ++ "AMD_PLANE_DEGAMMA_TF", ++ amdgpu_eotf); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_degamma_tf_property = prop; ++ ++ prop = drm_property_create_range(adev_to_drm(adev), ++ 0, "AMD_PLANE_HDR_MULT", 0, U64_MAX); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_hdr_mult_property = prop; ++ ++ prop = drm_property_create(adev_to_drm(adev), ++ DRM_MODE_PROP_BLOB, ++ "AMD_PLANE_CTM", 0); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_ctm_property = prop; ++ ++ prop = drm_property_create(adev_to_drm(adev), ++ DRM_MODE_PROP_BLOB, ++ "AMD_PLANE_SHAPER_LUT", 0); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_shaper_lut_property = prop; ++ ++ prop = drm_property_create_range(adev_to_drm(adev), ++ DRM_MODE_PROP_IMMUTABLE, ++ "AMD_PLANE_SHAPER_LUT_SIZE", 0, UINT_MAX); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_shaper_lut_size_property = prop; ++ ++ prop = amdgpu_create_tf_property(adev_to_drm(adev), ++ "AMD_PLANE_SHAPER_TF", ++ amdgpu_inv_eotf); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_shaper_tf_property = prop; ++ ++ prop = drm_property_create(adev_to_drm(adev), ++ DRM_MODE_PROP_BLOB, ++ "AMD_PLANE_LUT3D", 0); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_lut3d_property = prop; ++ ++ prop = drm_property_create_range(adev_to_drm(adev), ++ DRM_MODE_PROP_IMMUTABLE, ++ "AMD_PLANE_LUT3D_SIZE", 0, UINT_MAX); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_lut3d_size_property = prop; ++ ++ prop = drm_property_create(adev_to_drm(adev), ++ DRM_MODE_PROP_BLOB, ++ "AMD_PLANE_BLEND_LUT", 0); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_blend_lut_property = prop; ++ ++ prop = drm_property_create_range(adev_to_drm(adev), ++ DRM_MODE_PROP_IMMUTABLE, ++ "AMD_PLANE_BLEND_LUT_SIZE", 0, UINT_MAX); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_blend_lut_size_property = prop; ++ ++ prop = amdgpu_create_tf_property(adev_to_drm(adev), ++ "AMD_PLANE_BLEND_TF", ++ amdgpu_eotf); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_blend_tf_property = prop; ++ ++ prop = amdgpu_create_tf_property(adev_to_drm(adev), ++ "AMD_CRTC_REGAMMA_TF", ++ amdgpu_inv_eotf); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.regamma_tf_property = prop; ++ ++ return 0; ++} ++#endif ++ + /** + * __extract_blob_lut - Extracts the DRM lut and lut size from a blob. + * @blob: DRM color mgmt property blob +@@ -182,7 +390,6 @@ static void __drm_lut_to_dc_gamma(const struct drm_color_lut *lut, + static void __drm_ctm_to_dc_matrix(const struct drm_color_ctm *ctm, + struct fixed31_32 *matrix) + { +- int64_t val; + int i; + + /* +@@ -201,12 +408,33 @@ static void __drm_ctm_to_dc_matrix(const struct drm_color_ctm *ctm, + } + + /* gamut_remap_matrix[i] = ctm[i - floor(i/4)] */ +- val = ctm->matrix[i - (i / 4)]; +- /* If negative, convert to 2's complement. */ +- if (val & (1ULL << 63)) +- val = -(val & ~(1ULL << 63)); ++ matrix[i] = dc_fixpt_from_s3132(ctm->matrix[i - (i / 4)]); ++ } ++} + +- matrix[i].value = val; ++/** ++ * __drm_ctm2_to_dc_matrix - converts a DRM CTM2 to a DC CSC float matrix ++ * @ctm: DRM color transformation matrix ++ * @matrix: DC CSC float matrix ++ * ++ * The matrix needs to be a 3x4 (12 entry) matrix. 
++ */ ++static void __drm_ctm2_to_dc_matrix(const struct drm_color_ctm2 *ctm, ++ struct fixed31_32 *matrix) ++{ ++ int i; ++ ++ /* ++ * DRM gives a 3x3 matrix, but DC wants 3x4. Assuming we're operating ++ * with homogeneous coordinates, augment the matrix with 0's. ++ * ++ * The format provided is S31.32, using signed-magnitude representation. ++ * Our fixed31_32 is also S31.32, but is using 2's complement. We have ++ * to convert from signed-magnitude to 2's complement. ++ */ ++ for (i = 0; i < 12; i++) { ++ /* gamut_remap_matrix[i] = ctm[i - floor(i/4)] */ ++ matrix[i] = dc_fixpt_from_s3132(ctm->matrix[i]); + } + } + +@@ -268,16 +496,18 @@ static int __set_output_tf(struct dc_transfer_func *func, + struct calculate_buffer cal_buffer = {0}; + bool res; + +- ASSERT(lut && lut_size == MAX_COLOR_LUT_ENTRIES); +- + cal_buffer.buffer_index = -1; + +- gamma = dc_create_gamma(); +- if (!gamma) +- return -ENOMEM; ++ if (lut_size) { ++ ASSERT(lut && lut_size == MAX_COLOR_LUT_ENTRIES); + +- gamma->num_entries = lut_size; +- __drm_lut_to_dc_gamma(lut, gamma, false); ++ gamma = dc_create_gamma(); ++ if (!gamma) ++ return -ENOMEM; ++ ++ gamma->num_entries = lut_size; ++ __drm_lut_to_dc_gamma(lut, gamma, false); ++ } + + if (func->tf == TRANSFER_FUNCTION_LINEAR) { + /* +@@ -285,27 +515,63 @@ static int __set_output_tf(struct dc_transfer_func *func, + * on top of a linear input. But degamma params can be used + * instead to simulate this. + */ +- gamma->type = GAMMA_CUSTOM; ++ if (gamma) ++ gamma->type = GAMMA_CUSTOM; + res = mod_color_calculate_degamma_params(NULL, func, +- gamma, true); ++ gamma, gamma != NULL); + } else { + /* + * Assume sRGB. The actual mapping will depend on whether the + * input was legacy or not. + */ +- gamma->type = GAMMA_CS_TFM_1D; +- res = mod_color_calculate_regamma_params(func, gamma, false, ++ if (gamma) ++ gamma->type = GAMMA_CS_TFM_1D; ++ res = mod_color_calculate_regamma_params(func, gamma, gamma != NULL, + has_rom, NULL, &cal_buffer); + } + +- dc_gamma_release(&gamma); ++ if (gamma) ++ dc_gamma_release(&gamma); + + return res ? 0 : -ENOMEM; + } + ++static int amdgpu_dm_set_atomic_regamma(struct dc_stream_state *stream, ++ const struct drm_color_lut *regamma_lut, ++ uint32_t regamma_size, bool has_rom, ++ enum dc_transfer_func_predefined tf) ++{ ++ struct dc_transfer_func *out_tf = stream->out_transfer_func; ++ int ret = 0; ++ ++ if (regamma_size || tf != TRANSFER_FUNCTION_LINEAR) { ++ /* CRTC RGM goes into RGM LUT. ++ * ++ * Note: there is no implicit sRGB regamma here. We are using ++ * degamma calculation from color module to calculate the curve ++ * from a linear base. ++ */ ++ out_tf->type = TF_TYPE_DISTRIBUTED_POINTS; ++ out_tf->tf = tf; ++ out_tf->sdr_ref_white_level = SDR_WHITE_LEVEL_INIT_VALUE; ++ ++ ret = __set_output_tf(out_tf, regamma_lut, regamma_size, has_rom); ++ } else { ++ /* ++ * No CRTC RGM means we can just put the block into bypass ++ * since we don't have any plane level adjustments using it. ++ */ ++ out_tf->type = TF_TYPE_BYPASS; ++ out_tf->tf = TRANSFER_FUNCTION_LINEAR; ++ } ++ ++ return ret; ++} ++ + /** + * __set_input_tf - calculates the input transfer function based on expected + * input space. ++ * @caps: dc color capabilities + * @func: transfer function + * @lut: lookup table that defines the color space + * @lut_size: size of respective lut. +@@ -313,27 +579,249 @@ static int __set_output_tf(struct dc_transfer_func *func, + * Returns: + * 0 in case of success. -ENOMEM if fails. 
+ */ +-static int __set_input_tf(struct dc_transfer_func *func, ++static int __set_input_tf(struct dc_color_caps *caps, struct dc_transfer_func *func, + const struct drm_color_lut *lut, uint32_t lut_size) + { + struct dc_gamma *gamma = NULL; + bool res; + +- gamma = dc_create_gamma(); +- if (!gamma) +- return -ENOMEM; ++ if (lut_size) { ++ gamma = dc_create_gamma(); ++ if (!gamma) ++ return -ENOMEM; + +- gamma->type = GAMMA_CUSTOM; +- gamma->num_entries = lut_size; ++ gamma->type = GAMMA_CUSTOM; ++ gamma->num_entries = lut_size; + +- __drm_lut_to_dc_gamma(lut, gamma, false); ++ __drm_lut_to_dc_gamma(lut, gamma, false); ++ } + +- res = mod_color_calculate_degamma_params(NULL, func, gamma, true); +- dc_gamma_release(&gamma); ++ res = mod_color_calculate_degamma_params(caps, func, gamma, gamma != NULL); ++ ++ if (gamma) ++ dc_gamma_release(&gamma); + + return res ? 0 : -ENOMEM; + } + ++static enum dc_transfer_func_predefined ++amdgpu_tf_to_dc_tf(enum amdgpu_transfer_function tf) ++{ ++ switch (tf) ++ { ++ default: ++ case AMDGPU_TRANSFER_FUNCTION_DEFAULT: ++ case AMDGPU_TRANSFER_FUNCTION_LINEAR: ++ return TRANSFER_FUNCTION_LINEAR; ++ case AMDGPU_TRANSFER_FUNCTION_SRGB_EOTF: ++ case AMDGPU_TRANSFER_FUNCTION_SRGB_INV_EOTF: ++ return TRANSFER_FUNCTION_SRGB; ++ case AMDGPU_TRANSFER_FUNCTION_BT709_EOTF: ++ case AMDGPU_TRANSFER_FUNCTION_BT709_INV_EOTF: ++ return TRANSFER_FUNCTION_BT709; ++ case AMDGPU_TRANSFER_FUNCTION_PQ_EOTF: ++ case AMDGPU_TRANSFER_FUNCTION_PQ_INV_EOTF: ++ return TRANSFER_FUNCTION_PQ; ++ case AMDGPU_TRANSFER_FUNCTION_UNITY: ++ return TRANSFER_FUNCTION_UNITY; ++ case AMDGPU_TRANSFER_FUNCTION_GAMMA22_EOTF: ++ case AMDGPU_TRANSFER_FUNCTION_GAMMA22_INV_EOTF: ++ return TRANSFER_FUNCTION_GAMMA22; ++ case AMDGPU_TRANSFER_FUNCTION_GAMMA24_EOTF: ++ case AMDGPU_TRANSFER_FUNCTION_GAMMA24_INV_EOTF: ++ return TRANSFER_FUNCTION_GAMMA24; ++ case AMDGPU_TRANSFER_FUNCTION_GAMMA26_EOTF: ++ case AMDGPU_TRANSFER_FUNCTION_GAMMA26_INV_EOTF: ++ return TRANSFER_FUNCTION_GAMMA26; ++ } ++} ++ ++static void __to_dc_lut3d_color(struct dc_rgb *rgb, ++ const struct drm_color_lut lut, ++ int bit_precision) ++{ ++ rgb->red = drm_color_lut_extract(lut.red, bit_precision); ++ rgb->green = drm_color_lut_extract(lut.green, bit_precision); ++ rgb->blue = drm_color_lut_extract(lut.blue, bit_precision); ++} ++ ++static void __drm_3dlut_to_dc_3dlut(const struct drm_color_lut *lut, ++ uint32_t lut3d_size, ++ struct tetrahedral_params *params, ++ bool use_tetrahedral_9, ++ int bit_depth) ++{ ++ struct dc_rgb *lut0; ++ struct dc_rgb *lut1; ++ struct dc_rgb *lut2; ++ struct dc_rgb *lut3; ++ int lut_i, i; ++ ++ ++ if (use_tetrahedral_9) { ++ lut0 = params->tetrahedral_9.lut0; ++ lut1 = params->tetrahedral_9.lut1; ++ lut2 = params->tetrahedral_9.lut2; ++ lut3 = params->tetrahedral_9.lut3; ++ } else { ++ lut0 = params->tetrahedral_17.lut0; ++ lut1 = params->tetrahedral_17.lut1; ++ lut2 = params->tetrahedral_17.lut2; ++ lut3 = params->tetrahedral_17.lut3; ++ } ++ ++ for (lut_i = 0, i = 0; i < lut3d_size - 4; lut_i++, i += 4) { ++ /* We should consider the 3dlut RGB values are distributed ++ * along four arrays lut0-3 where the first sizes 1229 and the ++ * other 1228. The bit depth supported for 3dlut channel is ++ * 12-bit, but DC also supports 10-bit. ++ * ++ * TODO: improve color pipeline API to enable the userspace set ++ * bit depth and 3D LUT size/stride, as specified by VA-API. 
++ */ ++ __to_dc_lut3d_color(&lut0[lut_i], lut[i], bit_depth); ++ __to_dc_lut3d_color(&lut1[lut_i], lut[i + 1], bit_depth); ++ __to_dc_lut3d_color(&lut2[lut_i], lut[i + 2], bit_depth); ++ __to_dc_lut3d_color(&lut3[lut_i], lut[i + 3], bit_depth); ++ } ++ /* lut0 has 1229 points (lut_size/4 + 1) */ ++ __to_dc_lut3d_color(&lut0[lut_i], lut[i], bit_depth); ++} ++ ++/* amdgpu_dm_atomic_lut3d - set DRM 3D LUT to DC stream ++ * @drm_lut3d: DRM CRTC (user) 3D LUT ++ * @drm_lut3d_size: size of 3D LUT ++ * @lut3d: DC 3D LUT ++ * ++ * Map DRM CRTC 3D LUT to DC 3D LUT and all necessary bits to program it ++ * on DCN MPC accordingly. ++ */ ++static void amdgpu_dm_atomic_lut3d(const struct drm_color_lut *drm_lut, ++ uint32_t drm_lut3d_size, ++ struct dc_3dlut *lut) ++{ ++ if (!drm_lut3d_size) { ++ lut->state.bits.initialized = 0; ++ } else { ++ /* Stride and bit depth are not programmable by API yet. ++ * Therefore, only supports 17x17x17 3D LUT (12-bit). ++ */ ++ lut->lut_3d.use_tetrahedral_9 = false; ++ lut->lut_3d.use_12bits = true; ++ lut->state.bits.initialized = 1; ++ __drm_3dlut_to_dc_3dlut(drm_lut, drm_lut3d_size, &lut->lut_3d, ++ lut->lut_3d.use_tetrahedral_9, ++ MAX_COLOR_3DLUT_BITDEPTH); ++ } ++} ++ ++static int amdgpu_dm_atomic_shaper_lut(const struct drm_color_lut *shaper_lut, ++ bool has_rom, ++ enum dc_transfer_func_predefined tf, ++ uint32_t shaper_size, ++ struct dc_transfer_func *func_shaper) ++{ ++ int ret = 0; ++ ++ if (shaper_size || tf != TRANSFER_FUNCTION_LINEAR) { ++ /* If DRM shaper LUT is set, we assume a linear color space ++ * (linearized by DRM degamma 1D LUT or not) ++ */ ++ func_shaper->type = TF_TYPE_DISTRIBUTED_POINTS; ++ func_shaper->tf = tf; ++ func_shaper->sdr_ref_white_level = SDR_WHITE_LEVEL_INIT_VALUE; ++ ++ ret = __set_output_tf(func_shaper, shaper_lut, shaper_size, has_rom); ++ } else { ++ func_shaper->type = TF_TYPE_BYPASS; ++ func_shaper->tf = TRANSFER_FUNCTION_LINEAR; ++ } ++ ++ return ret; ++} ++ ++static int amdgpu_dm_atomic_blend_lut(const struct drm_color_lut *blend_lut, ++ bool has_rom, ++ enum dc_transfer_func_predefined tf, ++ uint32_t blend_size, ++ struct dc_transfer_func *func_blend) ++{ ++ int ret = 0; ++ ++ if (blend_size || tf != TRANSFER_FUNCTION_LINEAR) { ++ /* DRM plane gamma LUT or TF means we are linearizing color ++ * space before blending (similar to degamma programming). As ++ * we don't have hardcoded curve support, or we use AMD color ++ * module to fill the parameters that will be translated to HW ++ * points. ++ */ ++ func_blend->type = TF_TYPE_DISTRIBUTED_POINTS; ++ func_blend->tf = tf; ++ func_blend->sdr_ref_white_level = SDR_WHITE_LEVEL_INIT_VALUE; ++ ++ ret = __set_input_tf(NULL, func_blend, blend_lut, blend_size); ++ } else { ++ func_blend->type = TF_TYPE_BYPASS; ++ func_blend->tf = TRANSFER_FUNCTION_LINEAR; ++ } ++ ++ return ret; ++} ++ ++/* amdgpu_dm_lut3d_size - get expected size according to hw color caps ++ * @adev: amdgpu device ++ * @lut_size: default size ++ * ++ * Return: ++ * lut_size if DC 3D LUT is supported, zero otherwise. ++ */ ++static uint32_t amdgpu_dm_get_lut3d_size(struct amdgpu_device *adev, ++ uint32_t lut_size) ++{ ++ return adev->dm.dc->caps.color.dpp.hw_3d_lut ? 
lut_size : 0; ++} ++ ++/** ++ * amdgpu_dm_verify_lut3d_size - verifies if 3D LUT is supported and if DRM 3D ++ * LUT matches the hw supported size ++ * @adev: amdgpu device ++ * @crtc_state: the DRM CRTC state ++ * ++ * Verifies if post-blending (MPC) 3D LUT is supported by the HW (DCN 3.0 or ++ * newer) and if the DRM 3D LUT matches the supported size. ++ * ++ * Returns: ++ * 0 on success. -EINVAL if lut size are invalid. ++ */ ++int amdgpu_dm_verify_lut3d_size(struct amdgpu_device *adev, ++ struct drm_plane_state *plane_state) ++{ ++ struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state); ++ const struct drm_color_lut *shaper = NULL, *lut3d = NULL; ++ uint32_t exp_size, size; ++ ++ /* shaper LUT is only available if 3D LUT color caps*/ ++ exp_size = amdgpu_dm_get_lut3d_size(adev, MAX_COLOR_LUT_ENTRIES); ++ shaper = __extract_blob_lut(dm_plane_state->shaper_lut, &size); ++ ++ if (shaper && size != exp_size) { ++ drm_dbg(&adev->ddev, ++ "Invalid Shaper LUT size. Should be %u but got %u.\n", ++ exp_size, size); ++ } ++ ++ exp_size = amdgpu_dm_get_lut3d_size(adev, MAX_COLOR_3DLUT_ENTRIES); ++ lut3d = __extract_blob_lut(dm_plane_state->lut3d, &size); ++ ++ if (lut3d && size != exp_size) { ++ drm_dbg(&adev->ddev, "Invalid 3D LUT size. Should be %u but got %u.\n", ++ exp_size, size); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ + /** + * amdgpu_dm_verify_lut_sizes - verifies if DRM luts match the hw supported sizes + * @crtc_state: the DRM CRTC state +@@ -401,9 +889,12 @@ int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc) + const struct drm_color_lut *degamma_lut, *regamma_lut; + uint32_t degamma_size, regamma_size; + bool has_regamma, has_degamma; ++ enum dc_transfer_func_predefined tf = TRANSFER_FUNCTION_LINEAR; + bool is_legacy; + int r; + ++ tf = amdgpu_tf_to_dc_tf(crtc->regamma_tf); ++ + r = amdgpu_dm_verify_lut_sizes(&crtc->base); + if (r) + return r; +@@ -440,26 +931,22 @@ int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc) + stream->out_transfer_func->type = TF_TYPE_DISTRIBUTED_POINTS; + stream->out_transfer_func->tf = TRANSFER_FUNCTION_SRGB; + ++ /* Note: although we pass has_rom as parameter here, we never ++ * actually use ROM because the color module only takes the ROM ++ * path if transfer_func->type == PREDEFINED. ++ * ++ * See more in mod_color_calculate_regamma_params() ++ */ + r = __set_legacy_tf(stream->out_transfer_func, regamma_lut, + regamma_size, has_rom); + if (r) + return r; +- } else if (has_regamma) { +- /* If atomic regamma, CRTC RGM goes into RGM LUT. */ +- stream->out_transfer_func->type = TF_TYPE_DISTRIBUTED_POINTS; +- stream->out_transfer_func->tf = TRANSFER_FUNCTION_LINEAR; +- +- r = __set_output_tf(stream->out_transfer_func, regamma_lut, +- regamma_size, has_rom); ++ } else { ++ regamma_size = has_regamma ? regamma_size : 0; ++ r = amdgpu_dm_set_atomic_regamma(stream, regamma_lut, ++ regamma_size, has_rom, tf); + if (r) + return r; +- } else { +- /* +- * No CRTC RGM means we can just put the block into bypass +- * since we don't have any plane level adjustments using it. +- */ +- stream->out_transfer_func->type = TF_TYPE_BYPASS; +- stream->out_transfer_func->tf = TRANSFER_FUNCTION_LINEAR; + } + + /* +@@ -495,20 +982,10 @@ int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc) + return 0; + } + +-/** +- * amdgpu_dm_update_plane_color_mgmt: Maps DRM color management to DC plane. 
+- * @crtc: amdgpu_dm crtc state +- * @dc_plane_state: target DC surface +- * +- * Update the underlying dc_stream_state's input transfer function (ITF) in +- * preparation for hardware commit. The transfer function used depends on +- * the preparation done on the stream for color management. +- * +- * Returns: +- * 0 on success. -ENOMEM if mem allocation fails. +- */ +-int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc, +- struct dc_plane_state *dc_plane_state) ++static int ++map_crtc_degamma_to_dc_plane(struct dm_crtc_state *crtc, ++ struct dc_plane_state *dc_plane_state, ++ struct dc_color_caps *caps) + { + const struct drm_color_lut *degamma_lut; + enum dc_transfer_func_predefined tf = TRANSFER_FUNCTION_SRGB; +@@ -531,8 +1008,7 @@ int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc, + °amma_size); + ASSERT(degamma_size == MAX_COLOR_LUT_ENTRIES); + +- dc_plane_state->in_transfer_func->type = +- TF_TYPE_DISTRIBUTED_POINTS; ++ dc_plane_state->in_transfer_func->type = TF_TYPE_DISTRIBUTED_POINTS; + + /* + * This case isn't fully correct, but also fairly +@@ -564,11 +1040,11 @@ int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc, + dc_plane_state->in_transfer_func->tf = + TRANSFER_FUNCTION_LINEAR; + +- r = __set_input_tf(dc_plane_state->in_transfer_func, ++ r = __set_input_tf(caps, dc_plane_state->in_transfer_func, + degamma_lut, degamma_size); + if (r) + return r; +- } else if (crtc->cm_is_degamma_srgb) { ++ } else { + /* + * For legacy gamma support we need the regamma input + * in linear space. Assume that the input is sRGB. +@@ -577,14 +1053,213 @@ int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc, + dc_plane_state->in_transfer_func->tf = tf; + + if (tf != TRANSFER_FUNCTION_SRGB && +- !mod_color_calculate_degamma_params(NULL, +- dc_plane_state->in_transfer_func, NULL, false)) ++ !mod_color_calculate_degamma_params(caps, ++ dc_plane_state->in_transfer_func, ++ NULL, false)) ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static int ++__set_dm_plane_degamma(struct drm_plane_state *plane_state, ++ struct dc_plane_state *dc_plane_state, ++ struct dc_color_caps *color_caps) ++{ ++ struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state); ++ const struct drm_color_lut *degamma_lut; ++ enum amdgpu_transfer_function tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT; ++ uint32_t degamma_size; ++ bool has_degamma_lut; ++ int ret; ++ ++ degamma_lut = __extract_blob_lut(dm_plane_state->degamma_lut, ++ °amma_size); ++ ++ has_degamma_lut = degamma_lut && ++ !__is_lut_linear(degamma_lut, degamma_size); ++ ++ tf = dm_plane_state->degamma_tf; ++ ++ /* If we don't have plane degamma LUT nor TF to set on DC, we have ++ * nothing to do here, return. ++ */ ++ if (!has_degamma_lut && tf == AMDGPU_TRANSFER_FUNCTION_DEFAULT) ++ return -EINVAL; ++ ++ dc_plane_state->in_transfer_func->tf = amdgpu_tf_to_dc_tf(tf); ++ ++ if (has_degamma_lut) { ++ ASSERT(degamma_size == MAX_COLOR_LUT_ENTRIES); ++ ++ dc_plane_state->in_transfer_func->type = ++ TF_TYPE_DISTRIBUTED_POINTS; ++ ++ ret = __set_input_tf(color_caps, dc_plane_state->in_transfer_func, ++ degamma_lut, degamma_size); ++ if (ret) ++ return ret; ++ } else { ++ dc_plane_state->in_transfer_func->type = ++ TF_TYPE_PREDEFINED; ++ ++ if (!mod_color_calculate_degamma_params(color_caps, ++ dc_plane_state->in_transfer_func, NULL, false)) + return -ENOMEM; +- } else { +- /* ...Otherwise we can just bypass the DGM block. 
*/ +- dc_plane_state->in_transfer_func->type = TF_TYPE_BYPASS; +- dc_plane_state->in_transfer_func->tf = TRANSFER_FUNCTION_LINEAR; ++ } ++ return 0; ++} ++ ++static int ++amdgpu_dm_plane_set_color_properties(struct drm_plane_state *plane_state, ++ struct dc_plane_state *dc_plane_state, ++ struct dc_color_caps *color_caps) ++{ ++ struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state); ++ enum amdgpu_transfer_function shaper_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT; ++ enum amdgpu_transfer_function blend_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT; ++ const struct drm_color_lut *shaper_lut, *lut3d, *blend_lut; ++ uint32_t shaper_size, lut3d_size, blend_size; ++ int ret; ++ ++ /* We have nothing to do here, return */ ++ if (!plane_state->color_mgmt_changed) ++ return 0; ++ ++ dc_plane_state->hdr_mult = dc_fixpt_from_s3132(dm_plane_state->hdr_mult); ++ ++ shaper_lut = __extract_blob_lut(dm_plane_state->shaper_lut, &shaper_size); ++ shaper_size = shaper_lut != NULL ? shaper_size : 0; ++ shaper_tf = dm_plane_state->shaper_tf; ++ lut3d = __extract_blob_lut(dm_plane_state->lut3d, &lut3d_size); ++ lut3d_size = lut3d != NULL ? lut3d_size : 0; ++ ++ amdgpu_dm_atomic_lut3d(lut3d, lut3d_size, dc_plane_state->lut3d_func); ++ ret = amdgpu_dm_atomic_shaper_lut(shaper_lut, false, ++ amdgpu_tf_to_dc_tf(shaper_tf), ++ shaper_size, ++ dc_plane_state->in_shaper_func); ++ if (ret) { ++ drm_dbg_kms(plane_state->plane->dev, ++ "setting plane %d shaper LUT failed.\n", ++ plane_state->plane->index); ++ ++ return ret; ++ } ++ ++ blend_tf = dm_plane_state->blend_tf; ++ blend_lut = __extract_blob_lut(dm_plane_state->blend_lut, &blend_size); ++ blend_size = blend_lut != NULL ? blend_size : 0; ++ ++ ret = amdgpu_dm_atomic_blend_lut(blend_lut, false, ++ amdgpu_tf_to_dc_tf(blend_tf), ++ blend_size, dc_plane_state->blend_tf); ++ if (ret) { ++ drm_dbg_kms(plane_state->plane->dev, ++ "setting plane %d gamma lut failed.\n", ++ plane_state->plane->index); ++ ++ return ret; + } + + return 0; + } ++ ++/** ++ * amdgpu_dm_update_plane_color_mgmt: Maps DRM color management to DC plane. ++ * @crtc: amdgpu_dm crtc state ++ * @plane_state: DRM plane state ++ * @dc_plane_state: target DC surface ++ * ++ * Update the underlying dc_stream_state's input transfer function (ITF) in ++ * preparation for hardware commit. The transfer function used depends on ++ * the preparation done on the stream for color management. ++ * ++ * Returns: ++ * 0 on success. -ENOMEM if mem allocation fails. ++ */ ++int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc, ++ struct drm_plane_state *plane_state, ++ struct dc_plane_state *dc_plane_state) ++{ ++ struct amdgpu_device *adev = drm_to_adev(crtc->base.state->dev); ++ struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state); ++ struct drm_color_ctm2 *ctm = NULL; ++ struct dc_color_caps *color_caps = NULL; ++ bool has_crtc_cm_degamma; ++ int ret; ++ ++ ret = amdgpu_dm_verify_lut3d_size(adev, plane_state); ++ if (ret) { ++ drm_dbg_driver(&adev->ddev, "amdgpu_dm_verify_lut3d_size() failed\n"); ++ return ret; ++ } ++ ++ if (dc_plane_state->ctx && dc_plane_state->ctx->dc) ++ color_caps = &dc_plane_state->ctx->dc->caps.color; ++ ++ /* Initially, we can just bypass the DGM block. 
*/ ++ dc_plane_state->in_transfer_func->type = TF_TYPE_BYPASS; ++ dc_plane_state->in_transfer_func->tf = TRANSFER_FUNCTION_LINEAR; ++ ++ /* After, we start to update values according to color props */ ++ has_crtc_cm_degamma = (crtc->cm_has_degamma || crtc->cm_is_degamma_srgb); ++ ++ ret = __set_dm_plane_degamma(plane_state, dc_plane_state, color_caps); ++ if (ret == -ENOMEM) ++ return ret; ++ ++ /* We only have one degamma block available (pre-blending) for the ++ * whole color correction pipeline, so that we can't actually perform ++ * plane and CRTC degamma at the same time. Explicitly reject atomic ++ * updates when userspace sets both plane and CRTC degamma properties. ++ */ ++ if (has_crtc_cm_degamma && ret != -EINVAL){ ++ drm_dbg_kms(crtc->base.crtc->dev, ++ "doesn't support plane and CRTC degamma at the same time\n"); ++ return -EINVAL; ++ } ++ ++ /* If we are here, it means we don't have plane degamma settings, check ++ * if we have CRTC degamma waiting for mapping to pre-blending degamma ++ * block ++ */ ++ if (has_crtc_cm_degamma) { ++ /* AMD HW doesn't have post-blending degamma caps. When DRM ++ * CRTC atomic degamma is set, we maps it to DPP degamma block ++ * (pre-blending) or, on legacy gamma, we use DPP degamma to ++ * linearize (implicit degamma) from sRGB/BT709 according to ++ * the input space. ++ */ ++ ret = map_crtc_degamma_to_dc_plane(crtc, dc_plane_state, color_caps); ++ if (ret) ++ return ret; ++ } ++ ++ /* Setup CRTC CTM. */ ++ if (dm_plane_state->ctm) { ++ ctm = (struct drm_color_ctm2 *)dm_plane_state->ctm->data; ++ ++ /* ++ * So far, if we have both plane and CRTC CTM, plane CTM takes ++ * the priority and we discard data for CRTC CTM, as ++ * implemented in dcn10_program_gamut_remap(). However, we ++ * have MPC gamut_remap_matrix from DCN3 family, therefore we ++ * can remap MPC programing of the matrix to MPC block and ++ * provide support for both DPP and MPC matrix at the same ++ * time. ++ */ ++ __drm_ctm2_to_dc_matrix(ctm, dc_plane_state->gamut_remap_matrix.matrix); ++ ++ dc_plane_state->gamut_remap_matrix.enable_remap = true; ++ dc_plane_state->input_csc_color_matrix.enable_adjustment = false; ++ } else { ++ /* Bypass CTM. */ ++ dc_plane_state->gamut_remap_matrix.enable_remap = false; ++ dc_plane_state->input_csc_color_matrix.enable_adjustment = false; ++ } ++ ++ return amdgpu_dm_plane_set_color_properties(plane_state, ++ dc_plane_state, color_caps); ++} +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +index 30d4c6fd95f5..e7b38cce010c 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +@@ -253,6 +253,7 @@ static struct drm_crtc_state *dm_crtc_duplicate_state(struct drm_crtc *crtc) + state->freesync_config = cur->freesync_config; + state->cm_has_degamma = cur->cm_has_degamma; + state->cm_is_degamma_srgb = cur->cm_is_degamma_srgb; ++ state->regamma_tf = cur->regamma_tf; + state->crc_skip_count = cur->crc_skip_count; + state->mpo_requested = cur->mpo_requested; + /* TODO Duplicate dc_stream after objects are stream object is flattened */ +@@ -289,6 +290,70 @@ static int amdgpu_dm_crtc_late_register(struct drm_crtc *crtc) + } + #endif + ++#ifdef AMD_PRIVATE_COLOR ++/** ++ * drm_crtc_additional_color_mgmt - enable additional color properties ++ * @crtc: DRM CRTC ++ * ++ * This function lets the driver enable post-blending CRTC regamma transfer ++ * function property in addition to DRM CRTC gamma LUT. 
Default value means ++ * linear transfer function, which is the default CRTC gamma LUT behaviour ++ * without this property. ++ */ ++static void ++dm_crtc_additional_color_mgmt(struct drm_crtc *crtc) ++{ ++ struct amdgpu_device *adev = drm_to_adev(crtc->dev); ++ ++ if(adev->dm.dc->caps.color.mpc.ogam_ram) ++ drm_object_attach_property(&crtc->base, ++ adev->mode_info.regamma_tf_property, ++ AMDGPU_TRANSFER_FUNCTION_DEFAULT); ++} ++ ++static int ++amdgpu_dm_atomic_crtc_set_property(struct drm_crtc *crtc, ++ struct drm_crtc_state *state, ++ struct drm_property *property, ++ uint64_t val) ++{ ++ struct amdgpu_device *adev = drm_to_adev(crtc->dev); ++ struct dm_crtc_state *acrtc_state = to_dm_crtc_state(state); ++ ++ if (property == adev->mode_info.regamma_tf_property) { ++ if (acrtc_state->regamma_tf != val) { ++ acrtc_state->regamma_tf = val; ++ acrtc_state->base.color_mgmt_changed |= 1; ++ } ++ } else { ++ drm_dbg_atomic(crtc->dev, ++ "[CRTC:%d:%s] unknown property [PROP:%d:%s]]\n", ++ crtc->base.id, crtc->name, ++ property->base.id, property->name); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ++amdgpu_dm_atomic_crtc_get_property(struct drm_crtc *crtc, ++ const struct drm_crtc_state *state, ++ struct drm_property *property, ++ uint64_t *val) ++{ ++ struct amdgpu_device *adev = drm_to_adev(crtc->dev); ++ struct dm_crtc_state *acrtc_state = to_dm_crtc_state(state); ++ ++ if (property == adev->mode_info.regamma_tf_property) ++ *val = acrtc_state->regamma_tf; ++ else ++ return -EINVAL; ++ ++ return 0; ++} ++#endif ++ + /* Implemented only the options currently available for the driver */ + static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = { + .reset = dm_crtc_reset_state, +@@ -307,6 +372,10 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = { + #if defined(CONFIG_DEBUG_FS) + .late_register = amdgpu_dm_crtc_late_register, + #endif ++#ifdef AMD_PRIVATE_COLOR ++ .atomic_set_property = amdgpu_dm_atomic_crtc_set_property, ++ .atomic_get_property = amdgpu_dm_atomic_crtc_get_property, ++#endif + }; + + static void dm_crtc_helper_disable(struct drm_crtc *crtc) +@@ -482,6 +551,9 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm, + + drm_mode_crtc_set_gamma_size(&acrtc->base, MAX_COLOR_LEGACY_LUT_ENTRIES); + ++#ifdef AMD_PRIVATE_COLOR ++ dm_crtc_additional_color_mgmt(&acrtc->base); ++#endif + return 0; + + fail: +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +index 322668973747..60e5ffb1863d 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +@@ -1317,8 +1317,14 @@ static void dm_drm_plane_reset(struct drm_plane *plane) + amdgpu_state = kzalloc(sizeof(*amdgpu_state), GFP_KERNEL); + WARN_ON(amdgpu_state == NULL); + +- if (amdgpu_state) +- __drm_atomic_helper_plane_reset(plane, &amdgpu_state->base); ++ if (!amdgpu_state) ++ return; ++ ++ __drm_atomic_helper_plane_reset(plane, &amdgpu_state->base); ++ amdgpu_state->degamma_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT; ++ amdgpu_state->hdr_mult = AMDGPU_HDR_MULT_DEFAULT; ++ amdgpu_state->shaper_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT; ++ amdgpu_state->blend_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT; + } + + static struct drm_plane_state * +@@ -1338,6 +1344,22 @@ dm_drm_plane_duplicate_state(struct drm_plane *plane) + dc_plane_state_retain(dm_plane_state->dc_state); + } + ++ if (dm_plane_state->degamma_lut) ++ drm_property_blob_get(dm_plane_state->degamma_lut); ++ if 
(dm_plane_state->ctm) ++ drm_property_blob_get(dm_plane_state->ctm); ++ if (dm_plane_state->shaper_lut) ++ drm_property_blob_get(dm_plane_state->shaper_lut); ++ if (dm_plane_state->lut3d) ++ drm_property_blob_get(dm_plane_state->lut3d); ++ if (dm_plane_state->blend_lut) ++ drm_property_blob_get(dm_plane_state->blend_lut); ++ ++ dm_plane_state->degamma_tf = old_dm_plane_state->degamma_tf; ++ dm_plane_state->hdr_mult = old_dm_plane_state->hdr_mult; ++ dm_plane_state->shaper_tf = old_dm_plane_state->shaper_tf; ++ dm_plane_state->blend_tf = old_dm_plane_state->blend_tf; ++ + return &dm_plane_state->base; + } + +@@ -1405,12 +1427,203 @@ static void dm_drm_plane_destroy_state(struct drm_plane *plane, + { + struct dm_plane_state *dm_plane_state = to_dm_plane_state(state); + ++ if (dm_plane_state->degamma_lut) ++ drm_property_blob_put(dm_plane_state->degamma_lut); ++ if (dm_plane_state->ctm) ++ drm_property_blob_put(dm_plane_state->ctm); ++ if (dm_plane_state->lut3d) ++ drm_property_blob_put(dm_plane_state->lut3d); ++ if (dm_plane_state->shaper_lut) ++ drm_property_blob_put(dm_plane_state->shaper_lut); ++ if (dm_plane_state->blend_lut) ++ drm_property_blob_put(dm_plane_state->blend_lut); ++ + if (dm_plane_state->dc_state) + dc_plane_state_release(dm_plane_state->dc_state); + + drm_atomic_helper_plane_destroy_state(plane, state); + } + ++#ifdef AMD_PRIVATE_COLOR ++static void ++dm_atomic_plane_attach_color_mgmt_properties(struct amdgpu_display_manager *dm, ++ struct drm_plane *plane) ++{ ++ struct amdgpu_mode_info mode_info = dm->adev->mode_info; ++ struct dpp_color_caps dpp_color_caps = dm->dc->caps.color.dpp; ++ ++ /* Check HW color pipeline capabilities for DPP (pre-blending) before expose*/ ++ if (dpp_color_caps.dgam_ram || dpp_color_caps.gamma_corr) { ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_degamma_lut_property, 0); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_degamma_lut_size_property, ++ MAX_COLOR_LUT_ENTRIES); ++ drm_object_attach_property(&plane->base, ++ dm->adev->mode_info.plane_degamma_tf_property, ++ AMDGPU_TRANSFER_FUNCTION_DEFAULT); ++ } ++ /* HDR MULT is always available */ ++ drm_object_attach_property(&plane->base, ++ dm->adev->mode_info.plane_hdr_mult_property, ++ AMDGPU_HDR_MULT_DEFAULT); ++ ++ /* Only enable plane CTM if both DPP and MPC gamut remap is available. 
*/ ++ if (dm->dc->caps.color.mpc.gamut_remap) ++ drm_object_attach_property(&plane->base, ++ dm->adev->mode_info.plane_ctm_property, 0); ++ ++ if (dpp_color_caps.hw_3d_lut) { ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_shaper_lut_property, 0); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_shaper_lut_size_property, ++ MAX_COLOR_LUT_ENTRIES); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_shaper_tf_property, ++ AMDGPU_TRANSFER_FUNCTION_DEFAULT); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_lut3d_property, 0); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_lut3d_size_property, ++ MAX_COLOR_3DLUT_ENTRIES); ++ } ++ ++ if (dpp_color_caps.ogam_ram) { ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_blend_lut_property, 0); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_blend_lut_size_property, ++ MAX_COLOR_LUT_ENTRIES); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_blend_tf_property, ++ AMDGPU_TRANSFER_FUNCTION_DEFAULT); ++ } ++} ++ ++static int ++dm_atomic_plane_set_property(struct drm_plane *plane, ++ struct drm_plane_state *state, ++ struct drm_property *property, ++ uint64_t val) ++{ ++ struct dm_plane_state *dm_plane_state = to_dm_plane_state(state); ++ struct amdgpu_device *adev = drm_to_adev(plane->dev); ++ bool replaced = false; ++ int ret; ++ ++ if (property == adev->mode_info.plane_degamma_lut_property) { ++ ret = drm_property_replace_blob_from_id(plane->dev, ++ &dm_plane_state->degamma_lut, ++ val, ++ -1, sizeof(struct drm_color_lut), ++ &replaced); ++ dm_plane_state->base.color_mgmt_changed |= replaced; ++ return ret; ++ } else if (property == adev->mode_info.plane_degamma_tf_property) { ++ if (dm_plane_state->degamma_tf != val) { ++ dm_plane_state->degamma_tf = val; ++ dm_plane_state->base.color_mgmt_changed = 1; ++ } ++ } else if (property == adev->mode_info.plane_hdr_mult_property) { ++ if (dm_plane_state->hdr_mult != val) { ++ dm_plane_state->hdr_mult = val; ++ dm_plane_state->base.color_mgmt_changed = 1; ++ } ++ } else if (property == adev->mode_info.plane_ctm_property) { ++ ret = drm_property_replace_blob_from_id(plane->dev, ++ &dm_plane_state->ctm, ++ val, ++ sizeof(struct drm_color_ctm2), -1, ++ &replaced); ++ dm_plane_state->base.color_mgmt_changed |= replaced; ++ return ret; ++ } else if (property == adev->mode_info.plane_shaper_lut_property) { ++ ret = drm_property_replace_blob_from_id(plane->dev, ++ &dm_plane_state->shaper_lut, ++ val, -1, ++ sizeof(struct drm_color_lut), ++ &replaced); ++ dm_plane_state->base.color_mgmt_changed |= replaced; ++ return ret; ++ } else if (property == adev->mode_info.plane_shaper_tf_property) { ++ if (dm_plane_state->shaper_tf != val) { ++ dm_plane_state->shaper_tf = val; ++ dm_plane_state->base.color_mgmt_changed = 1; ++ } ++ } else if (property == adev->mode_info.plane_lut3d_property) { ++ ret = drm_property_replace_blob_from_id(plane->dev, ++ &dm_plane_state->lut3d, ++ val, -1, ++ sizeof(struct drm_color_lut), ++ &replaced); ++ dm_plane_state->base.color_mgmt_changed |= replaced; ++ return ret; ++ } else if (property == adev->mode_info.plane_blend_lut_property) { ++ ret = drm_property_replace_blob_from_id(plane->dev, ++ &dm_plane_state->blend_lut, ++ val, -1, ++ sizeof(struct drm_color_lut), ++ &replaced); ++ dm_plane_state->base.color_mgmt_changed |= replaced; ++ return ret; ++ } else if (property == adev->mode_info.plane_blend_tf_property) { ++ if (dm_plane_state->blend_tf != val) { ++ 
dm_plane_state->blend_tf = val; ++ dm_plane_state->base.color_mgmt_changed = 1; ++ } ++ } else { ++ drm_dbg_atomic(plane->dev, ++ "[PLANE:%d:%s] unknown property [PROP:%d:%s]]\n", ++ plane->base.id, plane->name, ++ property->base.id, property->name); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ++dm_atomic_plane_get_property(struct drm_plane *plane, ++ const struct drm_plane_state *state, ++ struct drm_property *property, ++ uint64_t *val) ++{ ++ struct dm_plane_state *dm_plane_state = to_dm_plane_state(state); ++ struct amdgpu_device *adev = drm_to_adev(plane->dev); ++ ++ if (property == adev->mode_info.plane_degamma_lut_property) { ++ *val = (dm_plane_state->degamma_lut) ? ++ dm_plane_state->degamma_lut->base.id : 0; ++ } else if (property == adev->mode_info.plane_degamma_tf_property) { ++ *val = dm_plane_state->degamma_tf; ++ } else if (property == adev->mode_info.plane_hdr_mult_property) { ++ *val = dm_plane_state->hdr_mult; ++ } else if (property == adev->mode_info.plane_ctm_property) { ++ *val = (dm_plane_state->ctm) ? ++ dm_plane_state->ctm->base.id : 0; ++ } else if (property == adev->mode_info.plane_shaper_lut_property) { ++ *val = (dm_plane_state->shaper_lut) ? ++ dm_plane_state->shaper_lut->base.id : 0; ++ } else if (property == adev->mode_info.plane_shaper_tf_property) { ++ *val = dm_plane_state->shaper_tf; ++ } else if (property == adev->mode_info.plane_lut3d_property) { ++ *val = (dm_plane_state->lut3d) ? ++ dm_plane_state->lut3d->base.id : 0; ++ } else if (property == adev->mode_info.plane_blend_lut_property) { ++ *val = (dm_plane_state->blend_lut) ? ++ dm_plane_state->blend_lut->base.id : 0; ++ } else if (property == adev->mode_info.plane_blend_tf_property) { ++ *val = dm_plane_state->blend_tf; ++ ++ } else { ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++#endif ++ + static const struct drm_plane_funcs dm_plane_funcs = { + .update_plane = drm_atomic_helper_update_plane, + .disable_plane = drm_atomic_helper_disable_plane, +@@ -1419,6 +1632,10 @@ static const struct drm_plane_funcs dm_plane_funcs = { + .atomic_duplicate_state = dm_drm_plane_duplicate_state, + .atomic_destroy_state = dm_drm_plane_destroy_state, + .format_mod_supported = dm_plane_format_mod_supported, ++#ifdef AMD_PRIVATE_COLOR ++ .atomic_set_property = dm_atomic_plane_set_property, ++ .atomic_get_property = dm_atomic_plane_get_property, ++#endif + }; + + int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm, +@@ -1489,6 +1706,9 @@ int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm, + + drm_plane_helper_add(plane, &dm_plane_helper_funcs); + ++#ifdef AMD_PRIVATE_COLOR ++ dm_atomic_plane_attach_color_mgmt_properties(dm, plane); ++#endif + /* Create (reset) the plane state */ + if (plane->funcs->reset) + plane->funcs->reset(plane); +diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c +index 3538973bd0c6..04b2e04b68f3 100644 +--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c ++++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c +@@ -349,20 +349,37 @@ bool cm_helper_translate_curve_to_hw_format(struct dc_context *ctx, + * segment is from 2^-10 to 2^1 + * There are less than 256 points, for optimization + */ +- seg_distr[0] = 3; +- seg_distr[1] = 4; +- seg_distr[2] = 4; +- seg_distr[3] = 4; +- seg_distr[4] = 4; +- seg_distr[5] = 4; +- seg_distr[6] = 4; +- seg_distr[7] = 4; +- seg_distr[8] = 4; +- seg_distr[9] = 4; +- seg_distr[10] = 1; +- +- region_start = -10; +- region_end = 1; ++ if (output_tf->tf 
== TRANSFER_FUNCTION_LINEAR) { ++ seg_distr[0] = 0; /* 2 */ ++ seg_distr[1] = 1; /* 4 */ ++ seg_distr[2] = 2; /* 4 */ ++ seg_distr[3] = 3; /* 8 */ ++ seg_distr[4] = 4; /* 16 */ ++ seg_distr[5] = 5; /* 32 */ ++ seg_distr[6] = 6; /* 64 */ ++ seg_distr[7] = 7; /* 128 */ ++ ++ region_start = -8; ++ region_end = 1; ++ } else { ++ seg_distr[0] = 3; /* 8 */ ++ seg_distr[1] = 4; /* 16 */ ++ seg_distr[2] = 4; ++ seg_distr[3] = 4; ++ seg_distr[4] = 4; ++ seg_distr[5] = 4; ++ seg_distr[6] = 4; ++ seg_distr[7] = 4; ++ seg_distr[8] = 4; ++ seg_distr[9] = 4; ++ seg_distr[10] = 1; /* 2 */ ++ /* total = 8*16 + 8 + 64 + 2 = */ ++ ++ region_start = -10; ++ region_end = 1; ++ } ++ ++ + } + + for (i = region_end - region_start; i < MAX_REGIONS_NUMBER ; i++) +@@ -375,16 +392,56 @@ bool cm_helper_translate_curve_to_hw_format(struct dc_context *ctx, + + j = 0; + for (k = 0; k < (region_end - region_start); k++) { +- increment = NUMBER_SW_SEGMENTS / (1 << seg_distr[k]); ++ /* ++ * We're using an ugly-ish hack here. Our HW allows for ++ * 256 segments per region but SW_SEGMENTS is 16. ++ * SW_SEGMENTS has some undocumented relationship to ++ * the number of points in the tf_pts struct, which ++ * is 512, unlike what's suggested TRANSFER_FUNC_POINTS. ++ * ++ * In order to work past this dilemma we'll scale our ++ * increment by (1 << 4) and then do the inverse (1 >> 4) ++ * when accessing the elements in tf_pts. ++ * ++ * TODO: find a better way using SW_SEGMENTS and ++ * TRANSFER_FUNC_POINTS definitions ++ */ ++ increment = (NUMBER_SW_SEGMENTS << 4) / (1 << seg_distr[k]); + start_index = (region_start + k + MAX_LOW_POINT) * + NUMBER_SW_SEGMENTS; +- for (i = start_index; i < start_index + NUMBER_SW_SEGMENTS; ++ for (i = (start_index << 4); i < (start_index << 4) + (NUMBER_SW_SEGMENTS << 4); + i += increment) { ++ struct fixed31_32 in_plus_one, in; ++ struct fixed31_32 value, red_value, green_value, blue_value; ++ uint32_t t = i & 0xf; ++ + if (j == hw_points - 1) + break; +- rgb_resulted[j].red = output_tf->tf_pts.red[i]; +- rgb_resulted[j].green = output_tf->tf_pts.green[i]; +- rgb_resulted[j].blue = output_tf->tf_pts.blue[i]; ++ ++ in_plus_one = output_tf->tf_pts.red[(i >> 4) + 1]; ++ in = output_tf->tf_pts.red[i >> 4]; ++ value = dc_fixpt_sub(in_plus_one, in); ++ value = dc_fixpt_shr(dc_fixpt_mul_int(value, t), 4); ++ value = dc_fixpt_add(in, value); ++ red_value = value; ++ ++ in_plus_one = output_tf->tf_pts.green[(i >> 4) + 1]; ++ in = output_tf->tf_pts.green[i >> 4]; ++ value = dc_fixpt_sub(in_plus_one, in); ++ value = dc_fixpt_shr(dc_fixpt_mul_int(value, t), 4); ++ value = dc_fixpt_add(in, value); ++ green_value = value; ++ ++ in_plus_one = output_tf->tf_pts.blue[(i >> 4) + 1]; ++ in = output_tf->tf_pts.blue[i >> 4]; ++ value = dc_fixpt_sub(in_plus_one, in); ++ value = dc_fixpt_shr(dc_fixpt_mul_int(value, t), 4); ++ value = dc_fixpt_add(in, value); ++ blue_value = value; ++ ++ rgb_resulted[j].red = red_value; ++ rgb_resulted[j].green = green_value; ++ rgb_resulted[j].blue = blue_value; + j++; + } + } +diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c +index bf8864bc8a99..72558eb877dc 100644 +--- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c ++++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c +@@ -186,6 +186,43 @@ bool dcn30_set_input_transfer_func(struct dc *dc, + return result; + } + ++void dcn30_program_gamut_remap(struct pipe_ctx *pipe_ctx) ++{ ++ int i = 0; ++ struct dpp_grph_csc_adjustment dpp_adjust; ++ struct 
mpc_grph_gamut_adjustment mpc_adjust; ++ int mpcc_id = pipe_ctx->plane_res.hubp->inst; ++ struct mpc *mpc = pipe_ctx->stream_res.opp->ctx->dc->res_pool->mpc; ++ ++ memset(&dpp_adjust, 0, sizeof(dpp_adjust)); ++ dpp_adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_BYPASS; ++ ++ if (pipe_ctx->plane_state && ++ pipe_ctx->plane_state->gamut_remap_matrix.enable_remap == true) { ++ dpp_adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_SW; ++ for (i = 0; i < CSC_TEMPERATURE_MATRIX_SIZE; i++) ++ dpp_adjust.temperature_matrix[i] = ++ pipe_ctx->plane_state->gamut_remap_matrix.matrix[i]; ++ } ++ ++ pipe_ctx->plane_res.dpp->funcs->dpp_set_gamut_remap(pipe_ctx->plane_res.dpp, ++ &dpp_adjust); ++ ++ memset(&mpc_adjust, 0, sizeof(mpc_adjust)); ++ mpc_adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_BYPASS; ++ ++ if (pipe_ctx->top_pipe == NULL) { ++ if (pipe_ctx->stream->gamut_remap_matrix.enable_remap == true) { ++ mpc_adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_SW; ++ for (i = 0; i < CSC_TEMPERATURE_MATRIX_SIZE; i++) ++ mpc_adjust.temperature_matrix[i] = ++ pipe_ctx->stream->gamut_remap_matrix.matrix[i]; ++ } ++ } ++ ++ mpc->funcs->set_gamut_remap(mpc, mpcc_id, &mpc_adjust); ++} ++ + bool dcn30_set_output_transfer_func(struct dc *dc, + struct pipe_ctx *pipe_ctx, + const struct dc_stream_state *stream) +diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h +index a24a8e33a3d2..cb34ca932a5f 100644 +--- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h ++++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h +@@ -58,6 +58,9 @@ bool dcn30_set_blend_lut(struct pipe_ctx *pipe_ctx, + bool dcn30_set_input_transfer_func(struct dc *dc, + struct pipe_ctx *pipe_ctx, + const struct dc_plane_state *plane_state); ++ ++void dcn30_program_gamut_remap(struct pipe_ctx *pipe_ctx); ++ + bool dcn30_set_output_transfer_func(struct dc *dc, + struct pipe_ctx *pipe_ctx, + const struct dc_stream_state *stream); +diff --git a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c +index 257df8660b4c..81fd50ee97c3 100644 +--- a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c ++++ b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c +@@ -33,7 +33,7 @@ + #include "dcn301_init.h" + + static const struct hw_sequencer_funcs dcn301_funcs = { +- .program_gamut_remap = dcn10_program_gamut_remap, ++ .program_gamut_remap = dcn30_program_gamut_remap, + .init_hw = dcn10_init_hw, + .power_down_on_boot = dcn10_power_down_on_boot, + .apply_ctx_to_hw = dce110_apply_ctx_to_hw, +diff --git a/drivers/gpu/drm/amd/display/include/fixed31_32.h b/drivers/gpu/drm/amd/display/include/fixed31_32.h +index d4cf7ead1d87..84da1dd34efd 100644 +--- a/drivers/gpu/drm/amd/display/include/fixed31_32.h ++++ b/drivers/gpu/drm/amd/display/include/fixed31_32.h +@@ -69,6 +69,18 @@ static const struct fixed31_32 dc_fixpt_epsilon = { 1LL }; + static const struct fixed31_32 dc_fixpt_half = { 0x80000000LL }; + static const struct fixed31_32 dc_fixpt_one = { 0x100000000LL }; + ++static inline struct fixed31_32 dc_fixpt_from_s3132(__u64 x) ++{ ++ struct fixed31_32 val; ++ ++ /* If negative, convert to 2's complement. 
*/ ++ if (x & (1ULL << 63)) ++ x = -(x & ~(1ULL << 63)); ++ ++ val.value = x; ++ return val; ++} ++ + /* + * @brief + * Initialization routines +diff --git a/drivers/gpu/drm/arm/malidp_crtc.c b/drivers/gpu/drm/arm/malidp_crtc.c +index dc01c43f6193..d72c22dcf685 100644 +--- a/drivers/gpu/drm/arm/malidp_crtc.c ++++ b/drivers/gpu/drm/arm/malidp_crtc.c +@@ -221,7 +221,7 @@ static int malidp_crtc_atomic_check_ctm(struct drm_crtc *crtc, + + /* + * The size of the ctm is checked in +- * drm_atomic_replace_property_blob_from_id. ++ * drm_property_replace_blob_from_id. + */ + ctm = (struct drm_color_ctm *)state->ctm->data; + for (i = 0; i < ARRAY_SIZE(ctm->matrix); ++i) { +diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c +index c277b198fa3f..c3df45f90145 100644 +--- a/drivers/gpu/drm/drm_atomic.c ++++ b/drivers/gpu/drm/drm_atomic.c +@@ -733,6 +733,7 @@ static void drm_atomic_plane_print_state(struct drm_printer *p, + drm_get_color_encoding_name(state->color_encoding)); + drm_printf(p, "\tcolor-range=%s\n", + drm_get_color_range_name(state->color_range)); ++ drm_printf(p, "\tcolor_mgmt_changed=%d\n", state->color_mgmt_changed); + + if (plane->funcs->atomic_print_state) + plane->funcs->atomic_print_state(p, state); +diff --git a/drivers/gpu/drm/drm_atomic_state_helper.c b/drivers/gpu/drm/drm_atomic_state_helper.c +index 784e63d70a42..25bb0859fda7 100644 +--- a/drivers/gpu/drm/drm_atomic_state_helper.c ++++ b/drivers/gpu/drm/drm_atomic_state_helper.c +@@ -338,6 +338,7 @@ void __drm_atomic_helper_plane_duplicate_state(struct drm_plane *plane, + state->fence = NULL; + state->commit = NULL; + state->fb_damage_clips = NULL; ++ state->color_mgmt_changed = false; + } + EXPORT_SYMBOL(__drm_atomic_helper_plane_duplicate_state); + +diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c +index d867e7f9f2cd..a6a9ee5086dd 100644 +--- a/drivers/gpu/drm/drm_atomic_uapi.c ++++ b/drivers/gpu/drm/drm_atomic_uapi.c +@@ -362,39 +362,6 @@ static s32 __user *get_out_fence_for_connector(struct drm_atomic_state *state, + return fence_ptr; + } + +-static int +-drm_atomic_replace_property_blob_from_id(struct drm_device *dev, +- struct drm_property_blob **blob, +- uint64_t blob_id, +- ssize_t expected_size, +- ssize_t expected_elem_size, +- bool *replaced) +-{ +- struct drm_property_blob *new_blob = NULL; +- +- if (blob_id != 0) { +- new_blob = drm_property_lookup_blob(dev, blob_id); +- if (new_blob == NULL) +- return -EINVAL; +- +- if (expected_size > 0 && +- new_blob->length != expected_size) { +- drm_property_blob_put(new_blob); +- return -EINVAL; +- } +- if (expected_elem_size > 0 && +- new_blob->length % expected_elem_size != 0) { +- drm_property_blob_put(new_blob); +- return -EINVAL; +- } +- } +- +- *replaced |= drm_property_replace_blob(blob, new_blob); +- drm_property_blob_put(new_blob); +- +- return 0; +-} +- + static int drm_atomic_crtc_set_property(struct drm_crtc *crtc, + struct drm_crtc_state *state, struct drm_property *property, + uint64_t val) +@@ -415,7 +382,7 @@ static int drm_atomic_crtc_set_property(struct drm_crtc *crtc, + } else if (property == config->prop_vrr_enabled) { + state->vrr_enabled = val; + } else if (property == config->degamma_lut_property) { +- ret = drm_atomic_replace_property_blob_from_id(dev, ++ ret = drm_property_replace_blob_from_id(dev, + &state->degamma_lut, + val, + -1, sizeof(struct drm_color_lut), +@@ -423,7 +390,7 @@ static int drm_atomic_crtc_set_property(struct drm_crtc *crtc, + state->color_mgmt_changed |= replaced; + return 
ret; + } else if (property == config->ctm_property) { +- ret = drm_atomic_replace_property_blob_from_id(dev, ++ ret = drm_property_replace_blob_from_id(dev, + &state->ctm, + val, + sizeof(struct drm_color_ctm), -1, +@@ -431,7 +398,7 @@ static int drm_atomic_crtc_set_property(struct drm_crtc *crtc, + state->color_mgmt_changed |= replaced; + return ret; + } else if (property == config->gamma_lut_property) { +- ret = drm_atomic_replace_property_blob_from_id(dev, ++ ret = drm_property_replace_blob_from_id(dev, + &state->gamma_lut, + val, + -1, sizeof(struct drm_color_lut), +@@ -563,7 +530,7 @@ static int drm_atomic_plane_set_property(struct drm_plane *plane, + } else if (property == plane->color_range_property) { + state->color_range = val; + } else if (property == config->prop_fb_damage_clips) { +- ret = drm_atomic_replace_property_blob_from_id(dev, ++ ret = drm_property_replace_blob_from_id(dev, + &state->fb_damage_clips, + val, + -1, +@@ -729,7 +696,7 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector, + if (state->link_status != DRM_LINK_STATUS_GOOD) + state->link_status = val; + } else if (property == config->hdr_output_metadata_property) { +- ret = drm_atomic_replace_property_blob_from_id(dev, ++ ret = drm_property_replace_blob_from_id(dev, + &state->hdr_output_metadata, + val, + sizeof(struct hdr_output_metadata), -1, +diff --git a/drivers/gpu/drm/drm_property.c b/drivers/gpu/drm/drm_property.c +index dfec479830e4..f72ef6493340 100644 +--- a/drivers/gpu/drm/drm_property.c ++++ b/drivers/gpu/drm/drm_property.c +@@ -751,6 +751,55 @@ bool drm_property_replace_blob(struct drm_property_blob **blob, + } + EXPORT_SYMBOL(drm_property_replace_blob); + ++/** ++ * drm_property_replace_blob_from_id - replace a blob property taking a reference ++ * @dev: DRM device ++ * @blob: a pointer to the member blob to be replaced ++ * @blob_id: the id of the new blob to replace with ++ * @expected_size: expected size of the blob property ++ * @expected_elem_size: expected size of an element in the blob property ++ * @replaced: if the blob was in fact replaced ++ * ++ * Look up the new blob from id, take its reference, check expected sizes of ++ * the blob and its element and replace the old blob by the new one. Advertise ++ * if the replacement operation was successful. ++ * ++ * Return: true if the blob was in fact replaced. -EINVAL if the new blob was ++ * not found or sizes don't match. 
++ */ ++int drm_property_replace_blob_from_id(struct drm_device *dev, ++ struct drm_property_blob **blob, ++ uint64_t blob_id, ++ ssize_t expected_size, ++ ssize_t expected_elem_size, ++ bool *replaced) ++{ ++ struct drm_property_blob *new_blob = NULL; ++ ++ if (blob_id != 0) { ++ new_blob = drm_property_lookup_blob(dev, blob_id); ++ if (new_blob == NULL) ++ return -EINVAL; ++ ++ if (expected_size > 0 && ++ new_blob->length != expected_size) { ++ drm_property_blob_put(new_blob); ++ return -EINVAL; ++ } ++ if (expected_elem_size > 0 && ++ new_blob->length % expected_elem_size != 0) { ++ drm_property_blob_put(new_blob); ++ return -EINVAL; ++ } ++ } ++ ++ *replaced |= drm_property_replace_blob(blob, new_blob); ++ drm_property_blob_put(new_blob); ++ ++ return 0; ++} ++EXPORT_SYMBOL(drm_property_replace_blob_from_id); ++ + int drm_mode_getblob_ioctl(struct drm_device *dev, + void *data, struct drm_file *file_priv) + { +diff --git a/include/drm/drm_mode_object.h b/include/drm/drm_mode_object.h +index 912f1e415685..08d7a7f0188f 100644 +--- a/include/drm/drm_mode_object.h ++++ b/include/drm/drm_mode_object.h +@@ -60,7 +60,7 @@ struct drm_mode_object { + void (*free_cb)(struct kref *kref); + }; + +-#define DRM_OBJECT_MAX_PROPERTY 24 ++#define DRM_OBJECT_MAX_PROPERTY 64 + /** + * struct drm_object_properties - property tracking for &drm_mode_object + */ +diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h +index 51291983ea44..52c3287da0da 100644 +--- a/include/drm/drm_plane.h ++++ b/include/drm/drm_plane.h +@@ -237,6 +237,13 @@ struct drm_plane_state { + + /** @state: backpointer to global drm_atomic_state */ + struct drm_atomic_state *state; ++ ++ /** ++ * @color_mgmt_changed: Color management properties have changed. Used ++ * by the atomic helpers and drivers to steer the atomic commit control ++ * flow. ++ */ ++ bool color_mgmt_changed : 1; + }; + + static inline struct drm_rect +diff --git a/include/drm/drm_property.h b/include/drm/drm_property.h +index 65bc9710a470..082f29156b3e 100644 +--- a/include/drm/drm_property.h ++++ b/include/drm/drm_property.h +@@ -279,6 +279,12 @@ struct drm_property_blob *drm_property_create_blob(struct drm_device *dev, + const void *data); + struct drm_property_blob *drm_property_lookup_blob(struct drm_device *dev, + uint32_t id); ++int drm_property_replace_blob_from_id(struct drm_device *dev, ++ struct drm_property_blob **blob, ++ uint64_t blob_id, ++ ssize_t expected_size, ++ ssize_t expected_elem_size, ++ bool *replaced); + int drm_property_replace_global_blob(struct drm_device *dev, + struct drm_property_blob **replace, + size_t length, +diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h +index 43691058d28f..23fc19400998 100644 +--- a/include/uapi/drm/drm_mode.h ++++ b/include/uapi/drm/drm_mode.h +@@ -843,6 +843,14 @@ struct drm_color_ctm { + __u64 matrix[9]; + }; + ++struct drm_color_ctm2 { ++ /* ++ * Conversion matrix in S31.32 sign-magnitude ++ * (not two's complement!) format. 
++ */ ++ __u64 matrix[12]; ++}; ++ + struct drm_color_lut { + /* + * Values are mapped linearly to 0.0 - 1.0 range, with 0x0 == 0.0 and +-- +2.42.0 + +From f43591177032844d0dec73debda8218267d6d2ef Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 28 Aug 2023 14:01:19 +0200 +Subject: [PATCH 2/7] amd-pref-core Signed-off-by: Peter Jung --- .../admin-guide/kernel-parameters.txt | 5 + - Documentation/admin-guide/pm/amd-pstate.rst | 54 +++++++ - arch/x86/Kconfig | 3 +- + Documentation/admin-guide/pm/amd-pstate.rst | 53 ++++++ + arch/x86/Kconfig | 5 +- drivers/acpi/cppc_acpi.c | 13 ++ drivers/acpi/processor_driver.c | 6 + drivers/cpufreq/amd-pstate-ut.c | 50 +++--- @@ -16,10 +2148,10 @@ Signed-off-by: Peter Jung include/acpi/cppc_acpi.h | 5 + include/linux/amd-pstate.h | 1 + include/linux/cpufreq.h | 4 + - 11 files changed, 259 insertions(+), 47 deletions(-) + 11 files changed, 259 insertions(+), 48 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 722b6eca2e938..ac95d4c9666e4 100644 +index 722b6eca2e93..ac95d4c9666e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -363,6 +363,11 @@ @@ -35,10 +2167,10 @@ index 722b6eca2e938..ac95d4c9666e4 100644 Map of devices attached to JOY0DAT and JOY1DAT Format: , diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst -index 1cf40f69278cd..ef2b69935311f 100644 +index 1cf40f69278c..2369b58a3521 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst -@@ -353,6 +353,48 @@ is activated. In this mode, driver requests minimum and maximum performance +@@ -353,6 +353,47 @@ is activated. In this mode, driver requests minimum and maximum performance level and the platform autonomously selects a performance level in this range and appropriate to the current workload. @@ -80,14 +2212,13 @@ index 1cf40f69278cd..ef2b69935311f 100644 + +``amd_prefcore=disable`` + -+If ``amd_prefcore=disable`` is passed to kernel command line option -+then disable ``AMD Pstate Preferred Core`` if platform can support -+the Preferred Core feature. ++``AMD Pstate Preferred Core`` will be enabled if the underlying platform ++supports it. It can be disabled by kernerl parameter: ``amd_prefcore=disable``. + User Space Interface in ``sysfs`` - General =========================================== -@@ -385,6 +427,18 @@ control its functionality at the system level. They are located in the +@@ -385,6 +426,18 @@ control its functionality at the system level. They are located in the to the operation mode represented by that string - or to be unregistered in the "disable" case. 
@@ -107,7 +2238,7 @@ index 1cf40f69278cd..ef2b69935311f 100644 =============================================== diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index e36261b4ea14f..03322d2840faa 100644 +index e36261b4ea14..16df141bd8a2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1052,8 +1052,9 @@ config SCHED_MC @@ -115,14 +2246,15 @@ index e36261b4ea14f..03322d2840faa 100644 config SCHED_MC_PRIO bool "CPU core priorities scheduler support" - depends on SCHED_MC && CPU_SUP_INTEL +- select X86_INTEL_PSTATE + depends on SCHED_MC - select X86_INTEL_PSTATE -+ select X86_AMD_PSTATE ++ select X86_INTEL_PSTATE if CPU_SUP_INTEL ++ select X86_AMD_PSTATE if CPU_SUP_AMD select CPU_FREQ default y help diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c -index 7ff269a78c208..ad388a0e84842 100644 +index 7ff269a78c20..ad388a0e8484 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1154,6 +1154,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) @@ -146,7 +2278,7 @@ index 7ff269a78c208..ad388a0e84842 100644 * cppc_get_epp_perf - Get the epp register value. * @cpunum: CPU from which to get epp preference value. diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c -index 4bd16b3f07814..29b2fb68a35db 100644 +index 4bd16b3f0781..29b2fb68a35d 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -27,6 +27,7 @@ @@ -170,7 +2302,7 @@ index 4bd16b3f07814..29b2fb68a35db 100644 acpi_handle_debug(handle, "Unsupported event [0x%x]\n", event); break; diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c -index 7f3fe20489818..f04ae67dda372 100644 +index 7f3fe2048981..f04ae67dda37 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -64,27 +64,9 @@ static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = { @@ -303,7 +2435,7 @@ index 7f3fe20489818..f04ae67dda372 100644 static int __init amd_pstate_ut_init(void) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index 9a1e194d5cf88..8a8e4ecb1b5c6 100644 +index 9a1e194d5cf8..8a8e4ecb1b5c 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -37,6 +37,7 @@ @@ -572,7 +2704,7 @@ index 9a1e194d5cf88..8a8e4ecb1b5c6 100644 MODULE_AUTHOR("Huang Rui "); MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver"); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c -index 50bbc969ffe53..842357abfae60 100644 +index 50bbc969ffe5..842357abfae6 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2675,6 +2675,19 @@ void cpufreq_update_limits(unsigned int cpu) @@ -596,7 +2728,7 @@ index 50bbc969ffe53..842357abfae60 100644 * BOOST * *********************************************************************/ diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h -index 6126c977ece04..c0b69ffe7bdb4 100644 +index 6126c977ece0..c0b69ffe7bdb 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -139,6 +139,7 @@ struct cppc_cpudata { @@ -619,7 +2751,7 @@ index 6126c977ece04..c0b69ffe7bdb4 100644 { return -ENOTSUPP; diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h -index 446394f846064..fa86bc953d3e0 100644 +index 446394f84606..fa86bc953d3e 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -70,6 +70,7 @@ struct amd_cpudata { @@ -631,7 +2763,7 @@ index 446394f846064..fa86bc953d3e0 100644 u32 max_freq; u32 min_freq; diff --git a/include/linux/cpufreq.h 
b/include/linux/cpufreq.h -index 172ff51c1b2a4..766c83a4fae74 100644 +index 172ff51c1b2a..766c83a4fae7 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -231,6 +231,7 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); @@ -653,12 +2785,12 @@ index 172ff51c1b2a4..766c83a4fae74 100644 int (*bios_limit)(int cpu, unsigned int *limit); -- -2.41.0 +2.42.0 -From 85c40edbbd82439d1ca1e367eed47ad58119a341 Mon Sep 17 00:00:00 2001 +From b35ba9f5a6ca4ac70053f1120b2042daa320ea59 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sun, 13 Aug 2023 22:53:18 +0200 -Subject: [PATCH 2/6] bbr3 +Subject: [PATCH 3/7] bbr3 Signed-off-by: Peter Jung --- @@ -680,7 +2812,7 @@ Signed-off-by: Peter Jung 15 files changed, 1934 insertions(+), 551 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h -index 91a37c99ba665..ae0ee688c3f7b 100644 +index 91a37c99ba66..ae0ee688c3f7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -255,7 +255,9 @@ struct tcp_sock { @@ -695,7 +2827,7 @@ index 91a37c99ba665..ae0ee688c3f7b 100644 u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h -index c2b15f7e55161..a400a84088d38 100644 +index c2b15f7e5516..a400a84088d3 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -135,8 +135,8 @@ struct inet_connection_sock { @@ -710,7 +2842,7 @@ index c2b15f7e55161..a400a84088d38 100644 #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/include/net/tcp.h b/include/net/tcp.h -index 0ca972ebd3dd0..8eb194559b701 100644 +index 0ca972ebd3dd..8eb194559b70 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -370,6 +370,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, @@ -882,7 +3014,7 @@ index 0ca972ebd3dd0..8eb194559b701 100644 static inline void tcp_plb_init(const struct sock *sk, struct tcp_plb_state *plb) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h -index 50655de04c9b6..82f8bd8f0d161 100644 +index 50655de04c9b..82f8bd8f0d16 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -229,6 +229,29 @@ struct tcp_bbr_info { @@ -916,7 +3048,7 @@ index 50655de04c9b6..82f8bd8f0d161 100644 union tcp_cc_info { diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h -index 51c13cf9c5aee..de8dcba26becc 100644 +index 51c13cf9c5ae..de8dcba26bec 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -506,9 +506,11 @@ enum { @@ -933,7 +3065,7 @@ index 51c13cf9c5aee..de8dcba26becc 100644 struct rta_session { __u8 proto; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h -index 879eeb0a084b4..77270053a5e39 100644 +index 879eeb0a084b..77270053a5e3 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail { @@ -945,7 +3077,7 @@ index 879eeb0a084b4..77270053a5e39 100644 /* * Sender's congestion state indicating normal or abnormal situations diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 2dfb12230f089..2e14db3bee704 100644 +index 2dfb12230f08..2e14db3bee70 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -668,15 +668,18 @@ config TCP_CONG_BBR @@ -977,7 +3109,7 @@ index 2dfb12230f089..2e14db3bee704 100644 choice prompt "Default TCP congestion control" diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 8ed52e1e3c99a..0198ac17f3a8f 100644 +index 
8ed52e1e3c99..0198ac17f3a8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3083,6 +3083,7 @@ int tcp_disconnect(struct sock *sk, int flags) @@ -998,7 +3130,7 @@ index 8ed52e1e3c99a..0198ac17f3a8f 100644 info->tcpi_options |= TCPI_OPT_SYN_DATA; diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c -index 146792cd26fed..f4f477a69917d 100644 +index 146792cd26fe..f4f477a69917 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1,18 +1,19 @@ @@ -3643,7 +5775,7 @@ index 146792cd26fed..f4f477a69917d 100644 MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); +MODULE_VERSION(__stringify(BBR_VERSION)); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c -index 1b34050a7538b..66d40449b3f4f 100644 +index 1b34050a7538..66d40449b3f4 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -241,6 +241,7 @@ void tcp_init_congestion_control(struct sock *sk) @@ -3655,7 +5787,7 @@ index 1b34050a7538b..66d40449b3f4f 100644 icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index 57c8af1859c16..2195ba488142a 100644 +index 57c8af1859c1..2195ba488142 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -349,7 +349,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) @@ -3789,7 +5921,7 @@ index 57c8af1859c16..2195ba488142a 100644 tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c -index c8f2aa0033871..fdf51e436899f 100644 +index c8f2aa003387..fdf51e436899 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -440,6 +440,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) @@ -3802,7 +5934,7 @@ index c8f2aa0033871..fdf51e436899f 100644 const struct tcp_congestion_ops *ca; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 51d8638d4b4c6..2fb064057868a 100644 +index 51d8638d4b4c..2fb064057868 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -325,10 +325,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) @@ -3913,7 +6045,7 @@ index 51d8638d4b4c6..2fb064057868a 100644 goto rearm_timer; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c -index a8f6d9d06f2eb..8737f21346481 100644 +index a8f6d9d06f2e..8737f2134648 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -34,6 +34,24 @@ @@ -3993,7 +6125,7 @@ index a8f6d9d06f2eb..8737f21346481 100644 rs->interval_us = max(snd_us, ack_us); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c -index 206418b6d7c48..619069963ff07 100644 +index 206418b6d7c4..619069963ff0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -626,6 +626,7 @@ void tcp_write_timer_handler(struct sock *sk) @@ -4005,12 +6137,12 @@ index 206418b6d7c48..619069963ff07 100644 event = icsk->icsk_pending; -- -2.41.0 +2.42.0 -From 7f942a85c0cc0c584314cee751f793e8a7dc93ba Mon Sep 17 00:00:00 2001 +From 41db757e2b0e00035bdd9692a6b5d143eac1d33e Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 20 Aug 2023 15:54:59 +0200 -Subject: [PATCH 3/6] cachy +Date: Mon, 28 Aug 2023 14:01:56 +0200 +Subject: [PATCH 4/7] cachy Signed-off-by: Peter Jung --- @@ -4071,7 +6203,7 @@ Signed-off-by: Peter Jung create mode 100644 drivers/platform/x86/steamdeck.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index ac95d4c9666e4..b3eecf5b94f40 100644 +index ac95d4c9666e..b3eecf5b94f4 100644 --- 
a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4276,6 +4276,15 @@ @@ -4091,7 +6223,7 @@ index ac95d4c9666e4..b3eecf5b94f40 100644 Safety option to keep boot IRQs enabled. This should never be necessary. diff --git a/Makefile b/Makefile -index 4739c21a63e2e..daf528173b398 100644 +index 2fdd8b40b7e0..8a601d85cd3f 100644 --- a/Makefile +++ b/Makefile @@ -831,6 +831,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) @@ -4117,7 +6249,7 @@ index 4739c21a63e2e..daf528173b398 100644 KBUILD_CFLAGS += -Werror=date-time diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig -index 81764160451f7..2c15d3bf747a9 100644 +index 81764160451f..2c15d3bf747a 100644 --- a/arch/arc/configs/axs101_defconfig +++ b/arch/arc/configs/axs101_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y @@ -4129,7 +6261,7 @@ index 81764160451f7..2c15d3bf747a9 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig -index d5181275490ed..7d868e148d9a4 100644 +index d5181275490e..7d868e148d9a 100644 --- a/arch/arc/configs/axs103_defconfig +++ b/arch/arc/configs/axs103_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y @@ -4141,7 +6273,7 @@ index d5181275490ed..7d868e148d9a4 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig -index 07c89281c2e3a..1513324ddb008 100644 +index 07c89281c2e3..1513324ddb00 100644 --- a/arch/arc/configs/axs103_smp_defconfig +++ b/arch/arc/configs/axs103_smp_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y @@ -4153,7 +6285,7 @@ index 07c89281c2e3a..1513324ddb008 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig -index 8c3ed5d6e6c35..2db643853e8f4 100644 +index 8c3ed5d6e6c3..2db643853e8f 100644 --- a/arch/arc/configs/haps_hs_defconfig +++ b/arch/arc/configs/haps_hs_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y @@ -4165,7 +6297,7 @@ index 8c3ed5d6e6c35..2db643853e8f4 100644 CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig -index 61107e8bac336..d764007e5adad 100644 +index 61107e8bac33..d764007e5ada 100644 --- a/arch/arc/configs/haps_hs_smp_defconfig +++ b/arch/arc/configs/haps_hs_smp_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y @@ -4177,7 +6309,7 @@ index 61107e8bac336..d764007e5adad 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig -index 4ee2a1507b57f..ce6a4431a76dd 100644 +index 4ee2a1507b57..ce6a4431a76d 100644 --- a/arch/arc/configs/hsdk_defconfig +++ b/arch/arc/configs/hsdk_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y @@ -4189,7 +6321,7 @@ index 4ee2a1507b57f..ce6a4431a76dd 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig -index 3e98297759925..5044609540cc3 100644 +index 3e9829775992..5044609540cc 100644 --- a/arch/arc/configs/nsim_700_defconfig +++ b/arch/arc/configs/nsim_700_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y @@ -4201,7 +6333,7 @@ index 3e98297759925..5044609540cc3 100644 CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig -index 
502c87f351c87..748c809d1c4c6 100644 +index 502c87f351c8..748c809d1c4c 100644 --- a/arch/arc/configs/nsimosci_defconfig +++ b/arch/arc/configs/nsimosci_defconfig @@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y @@ -4213,7 +6345,7 @@ index 502c87f351c87..748c809d1c4c6 100644 CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig -index f721cc3997d02..205c32b0074ca 100644 +index f721cc3997d0..205c32b0074c 100644 --- a/arch/arc/configs/nsimosci_hs_defconfig +++ b/arch/arc/configs/nsimosci_hs_defconfig @@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y @@ -4225,7 +6357,7 @@ index f721cc3997d02..205c32b0074ca 100644 CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig -index 1419fc946a083..2477b7c809771 100644 +index 1419fc946a08..2477b7c80977 100644 --- a/arch/arc/configs/nsimosci_hs_smp_defconfig +++ b/arch/arc/configs/nsimosci_hs_smp_defconfig @@ -8,6 +8,7 @@ CONFIG_IKCONFIG_PROC=y @@ -4237,7 +6369,7 @@ index 1419fc946a083..2477b7c809771 100644 # CONFIG_COMPAT_BRK is not set CONFIG_KPROBES=y diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig -index 941bbadd6bf2c..e61132ba4f890 100644 +index 941bbadd6bf2..e61132ba4f89 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -14,6 +14,7 @@ CONFIG_INITRAMFS_SOURCE="../tb10x-rootfs.cpio" @@ -4249,7 +6381,7 @@ index 941bbadd6bf2c..e61132ba4f890 100644 # CONFIG_AIO is not set CONFIG_EMBEDDED=y diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig -index d3ef189c75f8b..922b1b24f5184 100644 +index d3ef189c75f8..922b1b24f518 100644 --- a/arch/arc/configs/vdk_hs38_defconfig +++ b/arch/arc/configs/vdk_hs38_defconfig @@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y @@ -4261,7 +6393,7 @@ index d3ef189c75f8b..922b1b24f5184 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig -index 944b347025fd1..ed64319f7eb29 100644 +index 944b347025fd..ed64319f7eb2 100644 --- a/arch/arc/configs/vdk_hs38_smp_defconfig +++ b/arch/arc/configs/vdk_hs38_smp_defconfig @@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y @@ -4273,7 +6405,7 @@ index 944b347025fd1..ed64319f7eb29 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu -index 00468adf180f1..46cc91cb622fc 100644 +index 00468adf180f..46cc91cb622f 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -157,7 +157,7 @@ config MPENTIUM4 @@ -4806,7 +6938,7 @@ index 00468adf180f1..46cc91cb622fc 100644 config IA32_FEAT_CTL def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile -index fdc2e3abd6152..63845db8bf8a5 100644 +index fdc2e3abd615..63845db8bf8a 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -67,7 +67,7 @@ export BITS @@ -4870,7 +7002,7 @@ index fdc2e3abd6152..63845db8bf8a5 100644 KBUILD_CFLAGS += $(cflags-y) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h -index b40c462b4af36..c4e66e60d559d 100644 +index b40c462b4af3..c4e66e60d559 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -27,6 +27,7 @@ struct pci_sysdata { @@ -4894,7 +7026,7 @@ index b40c462b4af36..c4e66e60d559d 100644 already-configured bus numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the loader */ diff --git a/arch/x86/include/asm/vermagic.h 
b/arch/x86/include/asm/vermagic.h -index 75884d2cdec37..02c1386eb653e 100644 +index 75884d2cdec3..02c1386eb653 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -17,6 +17,54 @@ @@ -4986,7 +7118,7 @@ index 75884d2cdec37..02c1386eb653e 100644 #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c -index ddb798603201e..7c20387d82029 100644 +index ddb798603201..7c20387d8202 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void) @@ -5008,7 +7140,7 @@ index ddb798603201e..7c20387d82029 100644 } -#endif diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index 3cce6de464a7b..9176bc4f07daa 100644 +index 3cce6de464a7..9176bc4f07da 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7627,6 +7627,7 @@ MODULE_ALIAS("bfq-iosched"); @@ -5032,7 +7164,7 @@ index 3cce6de464a7b..9176bc4f07daa 100644 slab_kill: diff --git a/drivers/Makefile b/drivers/Makefile -index 7241d80a7b293..ac0ca3498f43e 100644 +index 7241d80a7b29..ac0ca3498f43 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -64,15 +64,8 @@ obj-y += char/ @@ -5067,7 +7199,7 @@ index 7241d80a7b293..ac0ca3498f43e 100644 obj-$(CONFIG_MTD) += mtd/ obj-$(CONFIG_SPI) += spi/ diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c -index addba109406be..f819ee132ffa2 100644 +index addba109406b..f819ee132ffa 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1522,7 +1522,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) @@ -5123,7 +7255,7 @@ index addba109406be..f819ee132ffa2 100644 sysfs_add_file_to_group(&pdev->dev.kobj, &dev_attr_remapped_nvme.attr, diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 -index 438c9e75a04dc..1bbfeca5f01ec 100644 +index 438c9e75a04d..1bbfeca5f01e 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -9,7 +9,6 @@ config X86_INTEL_PSTATE @@ -5143,7 +7275,7 @@ index 438c9e75a04dc..1bbfeca5f01ec 100644 This driver adds a CPUFreq driver which utilizes a fine grain processor performance frequency control range instead of legacy diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig -index 9cfe8fc509d7d..efc3b0c0b4adb 100644 +index 9cfe8fc509d7..efc3b0c0b4ad 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -229,6 +229,15 @@ config I2C_CHT_WC @@ -5163,7 +7295,7 @@ index 9cfe8fc509d7d..efc3b0c0b4adb 100644 tristate "Nvidia nForce2, nForce3 and nForce4" depends on PCI diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile -index af56fe2c75c09..76be74584719e 100644 +index af56fe2c75c0..76be74584719 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC) += i2c-cht-wc.o @@ -5176,7 +7308,7 @@ index af56fe2c75c09..76be74584719e 100644 obj-$(CONFIG_I2C_NVIDIA_GPU) += i2c-nvidia-gpu.o diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c new file mode 100644 -index 0000000000000..0462f09520431 +index 000000000000..0462f0952043 --- /dev/null +++ b/drivers/i2c/busses/i2c-nct6775.c @@ -0,0 +1,647 @@ @@ -5828,7 +7960,7 @@ index 0000000000000..0462f09520431 +module_init(i2c_nct6775_init); +module_exit(i2c_nct6775_exit); diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c -index 809fbd014cd68..d54b35b147ee9 100644 +index 809fbd014cd6..d54b35b147ee 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ 
b/drivers/i2c/busses/i2c-piix4.c @@ -568,11 +568,11 @@ static int piix4_transaction(struct i2c_adapter *piix4_adapter) @@ -5846,7 +7978,7 @@ index 809fbd014cd68..d54b35b147ee9 100644 /* If the SMBus is still busy, we give up */ if (timeout == MAX_TIMEOUT) { diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c -index 1dc6227d353ec..bab1009ccef79 100644 +index 1dc6227d353e..bab1009ccef7 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3240,6 +3240,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) @@ -5862,7 +7994,7 @@ index 1dc6227d353ec..bab1009ccef79 100644 if (ret < 0) goto bad; diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile -index 37c8663de7fe1..897d19f92edeb 100644 +index 37c8663de7fe..897d19f92ede 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -1,4 +1,10 @@ @@ -5878,7 +8010,7 @@ index 37c8663de7fe1..897d19f92edeb 100644 obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c new file mode 100644 -index 0000000000000..e105e6f5cc91d +index 000000000000..e105e6f5cc91 --- /dev/null +++ b/drivers/pci/controller/intel-nvme-remap.c @@ -0,0 +1,462 @@ @@ -6345,7 +8477,7 @@ index 0000000000000..e105e6f5cc91d +MODULE_AUTHOR("Daniel Drake "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 321156ca273d5..5dda26c737e2c 100644 +index 321156ca273d..5dda26c737e2 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3718,6 +3718,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) @@ -6464,7 +8596,7 @@ index 321156ca273d5..5dda26c737e2c 100644 }; diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig -index 49c2c4cd8d000..956f4eff85b5b 100644 +index 49c2c4cd8d00..956f4eff85b5 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -643,6 +643,16 @@ config THINKPAD_LMI @@ -6506,7 +8638,7 @@ index 49c2c4cd8d000..956f4eff85b5b 100644 config P2SB diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile -index 52dfdf574ac2d..d32b6d87219ff 100644 +index 52dfdf574ac2..d32b6d87219f 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -66,6 +66,7 @@ obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o @@ -6526,7 +8658,7 @@ index 52dfdf574ac2d..d32b6d87219ff 100644 +obj-$(CONFIG_STEAMDECK) += steamdeck.o diff --git a/drivers/platform/x86/legion-laptop.c b/drivers/platform/x86/legion-laptop.c new file mode 100644 -index 0000000000000..d1268d239cc5f +index 000000000000..d1268d239cc5 --- /dev/null +++ b/drivers/platform/x86/legion-laptop.c @@ -0,0 +1,2783 @@ @@ -9315,7 +11447,7 @@ index 0000000000000..d1268d239cc5f +module_exit(legion_exit); diff --git a/drivers/platform/x86/steamdeck.c b/drivers/platform/x86/steamdeck.c new file mode 100644 -index 0000000000000..77a6677ec19e6 +index 000000000000..77a6677ec19e --- /dev/null +++ b/drivers/platform/x86/steamdeck.c @@ -0,0 +1,523 @@ @@ -9843,7 +11975,7 @@ index 0000000000000..77a6677ec19e6 +MODULE_DESCRIPTION("Steam Deck ACPI platform driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mm.h b/include/linux/mm.h -index 406ab9ea818fe..17794c2130550 100644 +index 34f9dba17c1a..4527f319019a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -191,7 +191,7 @@ static inline void __mm_zero_struct_page(struct page *page) @@ -9856,7 +11988,7 @@ index 406ab9ea818fe..17794c2130550 100644 extern int sysctl_max_map_count; 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index 716953ee1ebdb..dace360dc38d7 100644 +index 716953ee1ebd..dace360dc38d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1181,7 +1181,7 @@ struct readahead_control { @@ -9869,7 +12001,7 @@ index 716953ee1ebdb..dace360dc38d7 100644 void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h -index 45f09bec02c48..87b20e2ee2744 100644 +index 45f09bec02c4..87b20e2ee274 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -148,6 +148,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, @@ -9891,7 +12023,7 @@ index 45f09bec02c48..87b20e2ee2744 100644 { return &init_user_ns; diff --git a/init/Kconfig b/init/Kconfig -index f7f65af4ee129..71755cc8ed3e4 100644 +index f7f65af4ee12..71755cc8ed3e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -123,6 +123,10 @@ config THREAD_INFO_IN_TASK @@ -9942,7 +12074,7 @@ index f7f65af4ee129..71755cc8ed3e4 100644 bool "Optimize for size (-Os)" help diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 38ef6d06888ef..0f78364efd4f2 100644 +index 38ef6d06888e..0f78364efd4f 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -40,6 +40,27 @@ choice @@ -9984,7 +12116,7 @@ index 38ef6d06888ef..0f78364efd4f2 100644 config SCHED_HRTICK diff --git a/kernel/fork.c b/kernel/fork.c -index d2e12b6d2b180..95ca80492a379 100644 +index d2e12b6d2b18..95ca80492a37 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -100,6 +100,10 @@ @@ -10023,7 +12155,7 @@ index d2e12b6d2b180..95ca80492a379 100644 if (err) goto bad_unshare_out; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index b3e25be58e2b7..2c335df301718 100644 +index b3e25be58e2b..2c335df30171 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -69,9 +69,13 @@ @@ -10083,7 +12215,7 @@ index b3e25be58e2b7..2c335df301718 100644 #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 354a2d294f526..4dc780aa3bcc8 100644 +index 354a2d294f52..4dc780aa3bcc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,6 +95,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); @@ -10113,7 +12245,7 @@ index 354a2d294f526..4dc780aa3bcc8 100644 { .procname = "tainted", diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 1d8e47bed3f11..fec01d016a351 100644 +index 1d8e47bed3f1..fec01d016a35 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,6 +22,13 @@ @@ -10131,7 +12263,7 @@ index 1d8e47bed3f11..fec01d016a351 100644 static DEFINE_MUTEX(userns_state_mutex); diff --git a/mm/Kconfig b/mm/Kconfig -index 09130434e30d3..f772ba88df878 100644 +index 09130434e30d..f772ba88df87 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -631,7 +631,7 @@ config COMPACTION @@ -10144,7 +12276,7 @@ index 09130434e30d3..f772ba88df878 100644 # diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index d3f42009bb702..39b9fd0606304 100644 +index d3f42009bb70..39b9fd060630 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -71,7 +71,11 @@ static long ratelimit_pages = 32; @@ -10172,7 +12304,7 @@ index d3f42009bb702..39b9fd0606304 100644 EXPORT_SYMBOL_GPL(dirty_writeback_interval); diff --git a/mm/swap.c b/mm/swap.c -index cd8f0150ba3aa..42c405a4f114c 100644 +index cd8f0150ba3a..42c405a4f114 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1090,6 +1090,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) @@ -10193,7 +12325,7 @@ index cd8f0150ba3aa..42c405a4f114c 100644 +#endif } diff --git a/mm/vmpressure.c b/mm/vmpressure.c -index b52644771cc43..11a4b0e3b583c 100644 +index b52644771cc4..11a4b0e3b583 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; @@ -10209,7 +12341,7 @@ index b52644771cc43..11a4b0e3b583c 100644 /* diff --git a/mm/vmscan.c b/mm/vmscan.c -index 1080209a568bb..f76aa82682152 100644 +index 2fe4a11d63f4..445ce9324b01 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -186,7 +186,11 @@ struct scan_control { @@ -10224,7 +12356,7 @@ index 1080209a568bb..f76aa82682152 100644 LIST_HEAD(shrinker_list); DECLARE_RWSEM(shrinker_rwsem); -@@ -4593,7 +4597,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc +@@ -4594,7 +4598,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc } /* to protect the working set of the last N jiffies */ @@ -10237,24 +12369,334 @@ index 1080209a568bb..f76aa82682152 100644 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { -- -2.41.0 +2.42.0 -From 84774938778953b047ed348f924e2c9fae19e5cc Mon Sep 17 00:00:00 2001 +From b05442522d6f62443d6bbd57d68868d96910ee2e Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 20 Aug 2023 15:55:14 +0200 -Subject: [PATCH 4/6] fixes +Date: Mon, 28 Aug 2023 14:02:22 +0200 +Subject: [PATCH 5/7] fixes Signed-off-by: Peter Jung --- - drivers/bluetooth/btusb.c | 2 +- - include/linux/pageblock-flags.h | 2 +- - kernel/padata.c | 4 ++-- - mm/readahead.c | 10 +++++++++- - sound/pci/hda/cs35l41_hda.c | 2 +- - 5 files changed, 14 insertions(+), 6 deletions(-) + Documentation/ABI/stable/sysfs-block | 10 + + .../testing/sysfs-class-led-trigger-blkdev | 78 ++ + Documentation/leds/index.rst | 1 + + Documentation/leds/ledtrig-blkdev.rst | 158 +++ + block/mq-deadline.c | 3 +- + drivers/bluetooth/btusb.c | 2 +- + drivers/char/tpm/tpm_crb.c | 33 +- + drivers/leds/trigger/Kconfig | 9 + 
+ drivers/leds/trigger/Makefile | 1 + + drivers/leds/trigger/ledtrig-blkdev.c | 1218 +++++++++++++++++ + drivers/pinctrl/pinctrl-amd.c | 4 +- + include/linux/pageblock-flags.h | 2 +- + kernel/padata.c | 4 +- + mm/readahead.c | 10 +- + scripts/Makefile.vmlinux_o | 2 +- + sound/pci/hda/cs35l41_hda.c | 2 +- + 16 files changed, 1502 insertions(+), 35 deletions(-) + create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev + create mode 100644 Documentation/leds/ledtrig-blkdev.rst + create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c +diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block +index c57e5b7cb532..2d1df6c9b463 100644 +--- a/Documentation/ABI/stable/sysfs-block ++++ b/Documentation/ABI/stable/sysfs-block +@@ -101,6 +101,16 @@ Description: + devices that support receiving integrity metadata. + + ++What: /sys/block//linked_leds ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Directory that contains symbolic links to all LEDs that ++ are associated with (linked to) this block device by the ++ blkdev LED trigger. Only present when at least one LED ++ is linked. (See Documentation/leds/ledtrig-blkdev.rst.) ++ ++ + What: /sys/block///alignment_offset + Date: April 2009 + Contact: Martin K. Petersen +diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev +new file mode 100644 +index 000000000000..28ce8c814fb7 +--- /dev/null ++++ b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev +@@ -0,0 +1,78 @@ ++What: /sys/class/leds//blink_time ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Time (in milliseconds) that the LED will be on during a single ++ "blink". ++ ++What: /sys/class/leds//check_interval ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Interval (in milliseconds) between checks of the block devices ++ linked to this LED. The LED will be blinked if the correct type ++ of activity (see blink_on_{read,write,discard,flush} attributes) ++ has occurred on any of the linked devices since the previous ++ check. ++ ++What: /sys/class/leds//blink_on_read ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to read activity on any of its linked block devices. ++ ++What: /sys/class/leds//blink_on_write ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to write activity on any of its linked block devices. ++ ++What: /sys/class/leds//blink_on_discard ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to discard activity on any of its linked block devices. ++ ++What: /sys/class/leds//blink_on_flush ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to cache flush activity on any of its linked block devices. ++ ++What: /sys/class/leds//link_dev_by_path ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Associate a block device with this LED by writing the path to ++ the device special file (e.g. /dev/sda) to this attribute. ++ Symbolic links are followed. ++ ++What: /sys/class/leds//unlink_dev_by_path ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Remove the association between this LED and a block device by ++ writing the path to the device special file (e.g. /dev/sda) to ++ this attribute. 
Symbolic links are followed. ++ ++What: /sys/class/leds//unlink_dev_by_name ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Remove the association between this LED and a block device by ++ writing the kernel name of the device (e.g. sda) to this ++ attribute. ++ ++What: /sys/class/leds//linked_devices ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Directory containing links to all block devices that are ++ associated with this LED. (Note that the names of the ++ symbolic links in this directory are *kernel* names, which ++ may not match the device special file paths written to ++ link_device and unlink_device.) +diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst +index 3ade16c18328..3fd55a2cbfb5 100644 +--- a/Documentation/leds/index.rst ++++ b/Documentation/leds/index.rst +@@ -10,6 +10,7 @@ LEDs + leds-class + leds-class-flash + leds-class-multicolor ++ ledtrig-blkdev + ledtrig-oneshot + ledtrig-transient + ledtrig-usbport +diff --git a/Documentation/leds/ledtrig-blkdev.rst b/Documentation/leds/ledtrig-blkdev.rst +new file mode 100644 +index 000000000000..9ff5b99de451 +--- /dev/null ++++ b/Documentation/leds/ledtrig-blkdev.rst +@@ -0,0 +1,158 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++================================= ++Block Device (blkdev) LED Trigger ++================================= ++ ++Available when ``CONFIG_LEDS_TRIGGER_BLKDEV=y`` or ++``CONFIG_LEDS_TRIGGER_BLKDEV=m``. ++ ++See also: ++ ++* ``Documentation/ABI/testing/sysfs-class-led-trigger-blkdev`` ++* ``Documentation/ABI/stable/sysfs-block`` (``/sys/block//linked_leds``) ++ ++Overview ++======== ++ ++.. note:: ++ The examples below use ```` to refer to the name of a ++ system-specific LED. If no suitable LED is available on a test ++ system (in a virtual machine, for example), it is possible to ++ use a userspace LED. (See ``Documentation/leds/uleds.rst``.) ++ ++Verify that the ``blkdev`` LED trigger is available:: ++ ++ # grep blkdev /sys/class/leds//trigger ++ ... rfkill-none blkdev ++ ++(If the previous command produces no output, you may need to load the trigger ++module - ``modprobe ledtrig_blkdev``. If the module is not available, check ++the value of ``CONFIG_LEDS_TRIGGER_BLKDEV`` in your kernel configuration.) ++ ++Associate the LED with the ``blkdev`` LED trigger:: ++ ++ # echo blkdev > /sys/class/leds//trigger ++ ++ # cat /sys/class/leds//trigger ++ ... rfkill-none [blkdev] ++ ++Note that several new device attributes are available in the ++``/sys/class/leds/`` directory. ++ ++* ``link_dev_by_path``, ``unlink_dev_by_path``, and ``unlink_dev_by_name`` are ++ used to manage the set of block devices associated with this LED. The LED ++ will blink when activity occurs on any of its linked devices. ++ ++* ``blink_on_read``, ``blink_on_write``, ``blink_on_discard``, and ++ ``blink_on_flush`` are boolean values that determine whether the LED will ++ blink when a particular type of activity is detected on one of its linked ++ block devices. ++ ++* ``blink_time`` is the duration (in milliseconds) of each blink of this LED. ++ (The minimum value is 10 milliseconds.) ++ ++* ``check_interval`` is the frequency (in milliseconds) with which block devices ++ linked to this LED will be checked for activity and the LED blinked (if the ++ correct type of activity has occurred). ++ ++* The ``linked_devices`` directory will contain a symbolic link to every device ++ that is associated with this LED. 
++ ++Link a block device to the LED:: ++ ++ # echo /dev/sda > /sys/class/leds//link_dev_by_path ++ ++ # ls /sys/class/leds//linked_devices ++ sda ++ ++(The value written to ``link_dev_by_path`` must be the path of the device ++special file, such as ``/dev/sda``, that represents the block device - or the ++path of a symbolic link to such a device special file.) ++ ++Activity on the device will now cause the LED to blink. The duration of each ++blink (in milliseconds) can be adjusted by setting ++``/sys/class/leds//blink_time``. (But see **check_interval and ++blink_time** below.) ++ ++Associate a second device with the LED:: ++ ++ # echo /dev/sdb > /sys/class/leds//link_dev_by_path ++ ++ # ls /sys/class/leds//linked_devices ++ sda sdb ++ ++When a block device is linked to one or more LEDs, the LEDs are linked from ++the device's ``linked_leds`` directory:: ++ ++ # ls /sys/class/block/sd{a,b}/linked_leds ++ /sys/class/block/sda/linked_leds: ++ ++ ++ /sys/class/block/sdb/linked_leds: ++ ++ ++(The ``linked_leds`` directory only exists when the block device is linked to ++at least one LED.) ++ ++``check_interval`` and ``blink_time`` ++===================================== ++ ++* By default, linked block devices are checked for activity every 100 ++ milliseconds. This frequency can be changed for an LED via the ++ ``/sys/class/leds//check_interval`` attribute. (The minimum value is 25 ++ milliseconds.) ++ ++* All block devices associated with an LED are checked for activity every ++ ``check_interval`` milliseconds, and a blink is triggered if the correct type ++ of activity (as determined by the LED's ``blink_on_*`` attributes) is ++ detected. The duration of an LED's blink is determined by its ``blink_time`` ++ attribute. Thus (when the correct type of activity is detected), the LED will ++ be on for ``blink_time`` milliseconds and off for ++ ``check_interval - blink_time`` milliseconds. ++ ++* The LED subsystem ignores new blink requests for an LED that is already in ++ in the process of blinking, so setting a ``blink_time`` greater than or equal ++ to ``check_interval`` will cause some blinks to be missed. ++ ++* Because of processing times, scheduling latencies, etc., avoiding missed ++ blinks actually requires a difference of at least a few milliseconds between ++ the ``blink_time`` and ``check_interval``. The required difference is likely ++ to vary from system to system. As a reference, a Thecus N5550 NAS requires a ++ difference of 7 milliseconds (e.g. ``check_interval == 100``, ++ ``blink_time == 93``). ++ ++* The default values (``check_interval == 100``, ``blink_time == 75``) cause the ++ LED associated with a continuously active device to blink rapidly. For a more ++ "always on" effect, increase the ``blink_time`` (but not too much; see the ++ previous bullet). ++ ++Other Notes ++=========== ++ ++* Many (possibly all) types of block devices work with this trigger, including: ++ ++ * SCSI (including SATA and USB) hard disk drives and SSDs ++ * SCSI (including SATA and USB) optical drives ++ * NVMe SSDs ++ * SD cards ++ * loopback block devices (``/dev/loop*``) ++ * device mapper devices, such as LVM logical volumes ++ * MD RAID devices ++ * zRAM compressed RAM-disks ++ * partitions on block devices that support them ++ ++* The names of the symbolic links in ``/sys/class/leds//linked_devices`` ++ are **kernel** names, which may not match the paths used for ++ ``link_dev_by_path`` and ``unlink_dev_by_path``. 
This is most likely when a ++ symbolic link is used to refer to the device (as is common with logical ++ volumes), but it can be true for any device, because nothing prevents the ++ creation of device special files with arbitrary names (e.g. ++ ``sudo mknod /foo b 8 0``). ++ ++ Kernel names can be used to unlink block devices from LEDs by writing them to ++ the LED's ``unlink_dev_by_name`` attribute. ++ ++* The ``blkdev`` LED trigger supports many-to-many device/LED associations. ++ A device can be associated with multiple LEDs, and an LED can be associated ++ with multiple devices. +diff --git a/block/mq-deadline.c b/block/mq-deadline.c +index 02a916ba62ee..f958e79277b8 100644 +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -646,8 +646,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; ++ unsigned int shift = tags->bitmap_tags.sb.shift; + +- dd->async_depth = max(1UL, 3 * q->nr_requests / 4); ++ dd->async_depth = max(1U, 3 * (1U << shift) / 4); + + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); + } diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c -index 764d176e97351..deb10b89fa51f 100644 +index 764d176e9735..deb10b89fa51 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -945,7 +945,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev) @@ -10266,8 +12708,1332 @@ index 764d176e97351..deb10b89fa51f 100644 gpiod_set_value_cansleep(reset_gpio, 1); return; +diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c +index 9eb1a1859012..a5dbebb1acfc 100644 +--- a/drivers/char/tpm/tpm_crb.c ++++ b/drivers/char/tpm/tpm_crb.c +@@ -463,28 +463,6 @@ static bool crb_req_canceled(struct tpm_chip *chip, u8 status) + return (cancel & CRB_CANCEL_INVOKE) == CRB_CANCEL_INVOKE; + } + +-static int crb_check_flags(struct tpm_chip *chip) +-{ +- u32 val; +- int ret; +- +- ret = crb_request_locality(chip, 0); +- if (ret) +- return ret; +- +- ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val, NULL); +- if (ret) +- goto release; +- +- if (val == 0x414D4400U /* AMD */) +- chip->flags |= TPM_CHIP_FLAG_HWRNG_DISABLED; +- +-release: +- crb_relinquish_locality(chip, 0); +- +- return ret; +-} +- + static const struct tpm_class_ops tpm_crb = { + .flags = TPM_OPS_AUTO_STARTUP, + .status = crb_status, +@@ -826,9 +804,14 @@ static int crb_acpi_add(struct acpi_device *device) + if (rc) + goto out; + +- rc = crb_check_flags(chip); +- if (rc) +- goto out; ++#ifdef CONFIG_X86 ++ /* A quirk for https://www.amd.com/en/support/kb/faq/pa-410 */ ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && ++ priv->sm != ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON) { ++ dev_info(dev, "Disabling hwrng\n"); ++ chip->flags |= TPM_CHIP_FLAG_HWRNG_DISABLED; ++ } ++#endif /* CONFIG_X86 */ + + rc = tpm_chip_register(chip); + +diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig +index 2a57328eca20..05e80cfd0ed8 100644 +--- a/drivers/leds/trigger/Kconfig ++++ b/drivers/leds/trigger/Kconfig +@@ -155,4 +155,13 @@ config LEDS_TRIGGER_TTY + + When build as a module this driver will be called ledtrig-tty. + ++config LEDS_TRIGGER_BLKDEV ++ tristate "LED Trigger for block devices" ++ depends on BLOCK ++ help ++ The blkdev LED trigger allows LEDs to be controlled by block device ++ activity (reads and writes). ++ ++ See Documentation/leds/ledtrig-blkdev.rst. 
++ + endif # LEDS_TRIGGERS +diff --git a/drivers/leds/trigger/Makefile b/drivers/leds/trigger/Makefile +index 25c4db97cdd4..d53bab5d93f1 100644 +--- a/drivers/leds/trigger/Makefile ++++ b/drivers/leds/trigger/Makefile +@@ -16,3 +16,4 @@ obj-$(CONFIG_LEDS_TRIGGER_NETDEV) += ledtrig-netdev.o + obj-$(CONFIG_LEDS_TRIGGER_PATTERN) += ledtrig-pattern.o + obj-$(CONFIG_LEDS_TRIGGER_AUDIO) += ledtrig-audio.o + obj-$(CONFIG_LEDS_TRIGGER_TTY) += ledtrig-tty.o ++obj-$(CONFIG_LEDS_TRIGGER_BLKDEV) += ledtrig-blkdev.o +diff --git a/drivers/leds/trigger/ledtrig-blkdev.c b/drivers/leds/trigger/ledtrig-blkdev.c +new file mode 100644 +index 000000000000..9e0c4b66ea27 +--- /dev/null ++++ b/drivers/leds/trigger/ledtrig-blkdev.c +@@ -0,0 +1,1218 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++ ++/* ++ * Block device LED trigger ++ * ++ * Copyright 2021-2023 Ian Pilcher ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/** ++ * DOC: Overview ++ * ++ * The ``blkdev`` LED trigger works by periodically checking the activity ++ * counters of block devices that have been linked to one or more LEDs and ++ * blinking those LED(s) if the correct type of activity has occurred. The ++ * periodic check is scheduled with the Linux kernel's deferred work facility. ++ * ++ * Trigger-specific data about block devices and LEDs is stored in two data ++ * structures --- &struct blkdev_trig_bdev (a "BTB") and &struct blkdev_trig_led ++ * (a "BTL"). Each structure contains a &struct xarray that holds links to any ++ * linked devices of the other type. I.e. &blkdev_trig_bdev.linked_btls ++ * contains links to all BTLs whose LEDs have been linked to the BTB's block ++ * device, and &blkdev_trig_led.linked_btbs contains links to all BTBs whose ++ * block devices have been linked to the BTL's LED. Thus, a block device can ++ * be linked to more than one LED, and an LED can be linked to more than one ++ * block device. ++ */ ++ ++/* Default, minimum & maximum blink duration (milliseconds) */ ++#define BLKDEV_TRIG_BLINK_DEF 75 ++#define BLKDEV_TRIG_BLINK_MIN 10 ++#define BLKDEV_TRIG_BLINK_MAX 86400000 /* 24 hours */ ++ ++/* Default, minimum & maximum activity check interval (milliseconds) */ ++#define BLKDEV_TRIG_CHECK_DEF 100 ++#define BLKDEV_TRIG_CHECK_MIN 25 ++#define BLKDEV_TRIG_CHECK_MAX 86400000 /* 24 hours */ ++ ++/* ++ * If blkdev_trig_check() can't lock the mutex, how long to wait before trying ++ * again (milliseconds) ++ */ ++#define BLKDEV_TRIG_CHECK_RETRY 5 ++ ++/** ++ * struct blkdev_trig_bdev - Trigger-specific data about a block device. ++ * @last_checked: Time (in jiffies) at which the trigger last checked this ++ * block device for activity. ++ * @last_activity: Time (in jiffies) at which the trigger last detected ++ * activity of each type. ++ * @ios: Activity counter values for each type, corresponding to ++ * the timestamps in &last_activity. ++ * @index: &xarray index, so the BTB can be included in one or more ++ * &blkdev_trig_led.linked_btbs. ++ * @bdev: The block device. ++ * @linked_btls: The BTLs that represent the LEDs linked to the BTB's ++ * block device. ++ * ++ * Every block device linked to at least one LED gets a "BTB." A BTB is created ++ * when a block device that is not currently linked to any LEDs is linked to an ++ * LED. ++ * ++ * A BTB is freed when one of the following occurs: ++ * ++ * * The number of LEDs linked to the block device becomes zero, because it has ++ * been unlinked from its last LED using the trigger's &sysfs interface. 
++ * ++ * * The number of LEDs linked to the block device becomes zero, because the ++ * last LED to which it was linked has been disassociated from the trigger ++ * (which happens automatically if the LED device is removed from the system). ++ * ++ * * The BTB's block device is removed from the system. To accomodate this ++ * scenario, BTB's are created as device resources, so that the release ++ * function will be called by the driver core when the device is removed. ++ */ ++struct blkdev_trig_bdev { ++ unsigned long last_checked; ++ unsigned long last_activity[NR_STAT_GROUPS]; ++ unsigned long ios[NR_STAT_GROUPS]; ++ unsigned long index; ++ struct block_device *bdev; ++ struct xarray linked_btls; ++}; ++ ++/** ++ * struct blkdev_trig_led - Trigger-specific data about an LED. ++ * @last_checked: Time (in jiffies) at which the trigger last checked the ++ * the block devices linked to this LED for activity. ++ * @index: &xarray index, so the BTL can be included in one or more ++ * &blkdev_trig_bdev.linked_btls. ++ * @mode: Bitmask for types of block device activity that will ++ * cause this LED to blink --- reads, writes, discards, ++ * etc. ++ * @led: The LED device. ++ * @blink_msec: Duration of a blink (milliseconds). ++ * @check_jiffies: Frequency with which block devices linked to this LED ++ * should be checked for activity (jiffies). ++ * @linked_btbs: The BTBs that represent the block devices linked to the ++ * BTL's LED. ++ * @all_btls_node: The BTL's node in the module's list of all BTLs. ++ * ++ * Every LED associated with the block device trigger gets a "BTL." A BTL is ++ * created when the trigger is "activated" on an LED (usually by writing ++ * ``blkdev`` to the LED's &sysfs &trigger attribute). A BTL is freed wnen its ++ * LED is disassociated from the trigger, either through the trigger's &sysfs ++ * interface or because the LED device is removed from the system. 
++ */ ++struct blkdev_trig_led { ++ unsigned long last_checked; ++ unsigned long index; ++ unsigned long mode; /* must be ulong for atomic bit ops */ ++ struct led_classdev *led; ++ unsigned int blink_msec; ++ unsigned int check_jiffies; ++ struct xarray linked_btbs; ++ struct hlist_node all_btls_node; ++}; ++ ++/* Protects everything except atomic LED attributes */ ++static DEFINE_MUTEX(blkdev_trig_mutex); ++ ++/* BTB device resource release function */ ++static void blkdev_trig_btb_release(struct device *dev, void *res); ++ ++/* Index for next BTB or BTL */ ++static unsigned long blkdev_trig_next_index; ++ ++/* All LEDs associated with the trigger */ ++static HLIST_HEAD(blkdev_trig_all_btls); ++ ++/* Delayed work to periodically check for activity & blink LEDs */ ++static void blkdev_trig_check(struct work_struct *work); ++static DECLARE_DELAYED_WORK(blkdev_trig_work, blkdev_trig_check); ++ ++/* When is the delayed work scheduled to run next (jiffies) */ ++static unsigned long blkdev_trig_next_check; ++ ++/* Total number of BTB-to-BTL links */ ++static unsigned int blkdev_trig_link_count; ++ ++/* Empty sysfs attribute list for next 2 declarations */ ++static struct attribute *blkdev_trig_attrs_empty[] = { NULL }; ++ ++/* linked_leds sysfs directory for block devs linked to 1 or more LEDs */ ++static const struct attribute_group blkdev_trig_linked_leds = { ++ .name = "linked_leds", ++ .attrs = blkdev_trig_attrs_empty, ++}; ++ ++/* linked_devices sysfs directory for each LED associated with the trigger */ ++static const struct attribute_group blkdev_trig_linked_devs = { ++ .name = "linked_devices", ++ .attrs = blkdev_trig_attrs_empty, ++}; ++ ++ ++/* ++ * ++ * Delayed work to check for activity & blink LEDs ++ * ++ */ ++ ++/** ++ * blkdev_trig_blink() - Blink an LED, if the correct type of activity has ++ * occurred on the block device. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ * Return: &true if the LED is blinked, &false if not. ++ */ ++static bool blkdev_trig_blink(const struct blkdev_trig_led *btl, ++ const struct blkdev_trig_bdev *btb) ++{ ++ unsigned long mode, mask, delay_on, delay_off; ++ enum stat_group i; ++ ++ mode = READ_ONCE(btl->mode); ++ ++ for (i = STAT_READ, mask = 1; i <= STAT_FLUSH; ++i, mask <<= 1) { ++ ++ if (!(mode & mask)) ++ continue; ++ ++ if (time_before_eq(btb->last_activity[i], btl->last_checked)) ++ continue; ++ ++ delay_on = READ_ONCE(btl->blink_msec); ++ delay_off = 1; /* 0 leaves LED turned on */ ++ ++ led_blink_set_oneshot(btl->led, &delay_on, &delay_off, 0); ++ return true; ++ } ++ ++ return false; ++} ++ ++/** ++ * blkdev_trig_update_btb() - Update a BTB's activity counters and timestamps. ++ * @btb: The BTB ++ * @now: Timestamp (in jiffies) ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_update_btb(struct blkdev_trig_bdev *btb, ++ unsigned long now) ++{ ++ unsigned long new_ios; ++ enum stat_group i; ++ ++ for (i = STAT_READ; i <= STAT_FLUSH; ++i) { ++ ++ new_ios = part_stat_read(btb->bdev, ios[i]); ++ ++ if (new_ios != btb->ios[i]) { ++ btb->ios[i] = new_ios; ++ btb->last_activity[i] = now; ++ } ++ } ++ ++ btb->last_checked = now; ++} ++ ++/** ++ * blkdev_trig_check() - Check linked devices for activity and blink LEDs. ++ * @work: Delayed work (&blkdev_trig_work) ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. 
++ */ ++static void blkdev_trig_check(struct work_struct *work) ++{ ++ struct blkdev_trig_led *btl; ++ struct blkdev_trig_bdev *btb; ++ unsigned long index, delay, now, led_check, led_delay; ++ bool blinked; ++ ++ if (!mutex_trylock(&blkdev_trig_mutex)) { ++ delay = msecs_to_jiffies(BLKDEV_TRIG_CHECK_RETRY); ++ goto exit_reschedule; ++ } ++ ++ now = jiffies; ++ delay = ULONG_MAX; ++ ++ hlist_for_each_entry (btl, &blkdev_trig_all_btls, all_btls_node) { ++ ++ led_check = btl->last_checked + btl->check_jiffies; ++ ++ if (time_before_eq(led_check, now)) { ++ ++ blinked = false; ++ ++ xa_for_each (&btl->linked_btbs, index, btb) { ++ ++ if (btb->last_checked != now) ++ blkdev_trig_update_btb(btb, now); ++ if (!blinked) ++ blinked = blkdev_trig_blink(btl, btb); ++ } ++ ++ btl->last_checked = now; ++ led_delay = btl->check_jiffies; ++ ++ } else { ++ led_delay = led_check - now; ++ } ++ ++ if (led_delay < delay) ++ delay = led_delay; ++ } ++ ++ mutex_unlock(&blkdev_trig_mutex); ++ ++exit_reschedule: ++ WARN_ON_ONCE(delay == ULONG_MAX); ++ WARN_ON_ONCE(!schedule_delayed_work(&blkdev_trig_work, delay)); ++} ++ ++/** ++ * blkdev_trig_sched_led() - Set the schedule of the delayed work when a new ++ * LED is added to the schedule. ++ * @btl: The BTL that represents the LED ++ * ++ * Called when the number of block devices to which an LED is linked becomes ++ * non-zero. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_sched_led(const struct blkdev_trig_led *btl) ++{ ++ unsigned long delay = READ_ONCE(btl->check_jiffies); ++ unsigned long check_by = jiffies + delay; ++ ++ /* ++ * If no other LED-to-block device links exist, simply schedule the ++ * delayed work according to this LED's check_interval attribute ++ * (check_jiffies). ++ */ ++ if (blkdev_trig_link_count == 0) { ++ WARN_ON(!schedule_delayed_work(&blkdev_trig_work, delay)); ++ blkdev_trig_next_check = check_by; ++ return; ++ } ++ ++ /* ++ * If the next check is already scheduled to occur soon enough to ++ * accomodate this LED's check_interval, the schedule doesn't need ++ * to be changed. ++ */ ++ if (time_after_eq(check_by, blkdev_trig_next_check)) ++ return; ++ ++ /* ++ * Modify the schedule, so that the delayed work runs soon enough for ++ * this LED. ++ */ ++ WARN_ON(!mod_delayed_work(system_wq, &blkdev_trig_work, delay)); ++ blkdev_trig_next_check = check_by; ++} ++ ++ ++/* ++ * ++ * Linking and unlinking LEDs and block devices ++ * ++ */ ++ ++/** ++ * blkdev_trig_link() - Link a block device to an LED. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ * Return: &0 on success, negative &errno on error. 
++ */ ++static int blkdev_trig_link(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ bool led_first_link; ++ int err; ++ ++ led_first_link = xa_empty(&btl->linked_btbs); ++ ++ err = xa_insert(&btb->linked_btls, btl->index, btl, GFP_KERNEL); ++ if (err) ++ return err; ++ ++ err = xa_insert(&btl->linked_btbs, btb->index, btb, GFP_KERNEL); ++ if (err) ++ goto error_erase_btl; ++ ++ /* Create /sys/class/block//linked_leds/ symlink */ ++ err = sysfs_add_link_to_group(bdev_kobj(btb->bdev), ++ blkdev_trig_linked_leds.name, ++ &btl->led->dev->kobj, btl->led->name); ++ if (err) ++ goto error_erase_btb; ++ ++ /* Create /sys/class/leds//linked_devices/ symlink */ ++ err = sysfs_add_link_to_group(&btl->led->dev->kobj, ++ blkdev_trig_linked_devs.name, ++ bdev_kobj(btb->bdev), ++ dev_name(&btb->bdev->bd_device)); ++ if (err) ++ goto error_remove_symlink; ++ ++ /* ++ * If this is the first block device linked to this LED, the delayed ++ * work schedule may need to be changed. ++ */ ++ if (led_first_link) ++ blkdev_trig_sched_led(btl); ++ ++ ++blkdev_trig_link_count; ++ ++ return 0; ++ ++error_remove_symlink: ++ sysfs_remove_link_from_group(bdev_kobj(btb->bdev), ++ blkdev_trig_linked_leds.name, ++ btl->led->name); ++error_erase_btb: ++ xa_erase(&btl->linked_btbs, btb->index); ++error_erase_btl: ++ xa_erase(&btb->linked_btls, btl->index); ++ return err; ++} ++ ++/** ++ * blkdev_trig_put_btb() - Remove and free a BTB, if it is no longer needed. ++ * @btb: The BTB ++ * ++ * Does nothing if the BTB (block device) is still linked to at least one LED. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_put_btb(struct blkdev_trig_bdev *btb) ++{ ++ struct block_device *bdev = btb->bdev; ++ int err; ++ ++ if (xa_empty(&btb->linked_btls)) { ++ ++ sysfs_remove_group(bdev_kobj(bdev), &blkdev_trig_linked_leds); ++ err = devres_destroy(&bdev->bd_device, blkdev_trig_btb_release, ++ NULL, NULL); ++ WARN_ON(err); ++ } ++} ++ ++/** ++ * _blkdev_trig_unlink_always() - Perform the unconditionally required steps of ++ * unlinking a block device from an LED. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * When a block device is unlinked from an LED, certain steps must be performed ++ * only if the block device is **not** being released. This function performs ++ * those steps that are **always** required, whether or not the block device is ++ * being released. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void _blkdev_trig_unlink_always(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ --blkdev_trig_link_count; ++ ++ if (blkdev_trig_link_count == 0) ++ WARN_ON(!cancel_delayed_work_sync(&blkdev_trig_work)); ++ ++ xa_erase(&btb->linked_btls, btl->index); ++ xa_erase(&btl->linked_btbs, btb->index); ++ ++ /* Remove /sys/class/leds//linked_devices/ symlink */ ++ sysfs_remove_link_from_group(&btl->led->dev->kobj, ++ blkdev_trig_linked_devs.name, ++ dev_name(&btb->bdev->bd_device)); ++} ++ ++/** ++ * blkdev_trig_unlink_norelease() - Unlink an LED from a block device that is ++ * **not** being released. ++ * @btl: The BTL that represents the LED. ++ * @btb: The BTB that represents the block device. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. 
++ */ ++static void blkdev_trig_unlink_norelease(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ _blkdev_trig_unlink_always(btl, btb); ++ ++ /* Remove /sys/class/block//linked_leds/ symlink */ ++ sysfs_remove_link_from_group(bdev_kobj(btb->bdev), ++ blkdev_trig_linked_leds.name, ++ btl->led->name); ++ ++ blkdev_trig_put_btb(btb); ++} ++ ++/** ++ * blkdev_trig_unlink_release() - Unlink an LED from a block device that is ++ * being released. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_unlink_release(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ _blkdev_trig_unlink_always(btl, btb); ++ ++ /* ++ * If the BTB is being released, the driver core has already removed the ++ * device's attribute groups, and the BTB will be freed automatically, ++ * so there's nothing else to do. ++ */ ++} ++ ++ ++/* ++ * ++ * BTB creation ++ * ++ */ ++ ++/** ++ * blkdev_trig_btb_release() - BTB device resource release function. ++ * @dev: The block device ++ * @res: The BTB ++ * ++ * Called by the driver core when a block device with a BTB is removed. ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_btb_release(struct device *dev, void *res) ++{ ++ struct blkdev_trig_bdev *btb = res; ++ struct blkdev_trig_led *btl; ++ unsigned long index; ++ ++ mutex_lock(&blkdev_trig_mutex); ++ ++ xa_for_each (&btb->linked_btls, index, btl) ++ blkdev_trig_unlink_release(btl, btb); ++ ++ mutex_unlock(&blkdev_trig_mutex); ++} ++ ++/** ++ * blkdev_trig_get_bdev() - Get a block device by path. ++ * @path: The value written to an LED's &link_dev_by_path or ++ * &unlink_dev_by_path attribute, which should be the path to a ++ * special file that represents a block device ++ * @len: The number of characters in &path (not including its ++ * terminating null) ++ * ++ * The caller must call blkdev_put() when finished with the device. ++ * ++ * Context: Process context. ++ * Return: The block device, or an error pointer. ++ */ ++static struct block_device *blkdev_trig_get_bdev(const char *path, size_t len) ++{ ++ struct block_device *bdev; ++ char *buf; ++ ++ buf = kmemdup(path, len + 1, GFP_KERNEL); /* +1 to include null */ ++ if (buf == NULL) ++ return ERR_PTR(-ENOMEM); ++ ++ bdev = blkdev_get_by_path(strim(buf), 0, NULL, NULL); ++ kfree(buf); ++ return bdev; ++} ++ ++/** ++ * blkdev_trig_get_btb() - Find or create the BTB for a block device. ++ * @path: The value written to an LED's &link_dev_by_path attribute, ++ * which should be the path to a special file that represents a ++ * block device ++ * @len: The number of characters in &path ++ * ++ * If a new BTB is created, because the block device was not previously linked ++ * to any LEDs, the block device's &linked_leds &sysfs directory is created. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ * Return: Pointer to the BTB, error pointer on error. 
++ */ ++static struct blkdev_trig_bdev *blkdev_trig_get_btb(const char *path, ++ size_t len) ++{ ++ struct block_device *bdev; ++ struct blkdev_trig_bdev *btb; ++ int err; ++ ++ bdev = blkdev_trig_get_bdev(path, len); ++ if (IS_ERR(bdev)) ++ return ERR_CAST(bdev); ++ ++ btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release, ++ NULL, NULL); ++ if (btb != NULL) { ++ err = 0; ++ goto exit_put_bdev; ++ } ++ ++ if (blkdev_trig_next_index == ULONG_MAX) { ++ err = -EOVERFLOW; ++ goto exit_put_bdev; ++ } ++ ++ btb = devres_alloc(blkdev_trig_btb_release, sizeof(*btb), GFP_KERNEL); ++ if (btb == NULL) { ++ err = -ENOMEM; ++ goto exit_put_bdev; ++ } ++ ++ err = sysfs_create_group(bdev_kobj(bdev), &blkdev_trig_linked_leds); ++ if (err) ++ goto exit_free_btb; ++ ++ btb->index = blkdev_trig_next_index++; ++ btb->bdev = bdev; ++ xa_init(&btb->linked_btls); ++ ++ /* Populate BTB activity counters */ ++ blkdev_trig_update_btb(btb, jiffies); ++ ++ devres_add(&bdev->bd_device, btb); ++ ++exit_free_btb: ++ if (err) ++ devres_free(btb); ++exit_put_bdev: ++ blkdev_put(bdev, NULL); ++ return err ? ERR_PTR(err) : btb; ++} ++ ++ ++/* ++ * ++ * Activating and deactivating the trigger on an LED ++ * ++ */ ++ ++/** ++ * blkdev_trig_activate() - Called by the LEDs subsystem when an LED is ++ * associated with the trigger. ++ * @led: The LED ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &0 on success, negative &errno on error. ++ */ ++static int blkdev_trig_activate(struct led_classdev *led) ++{ ++ struct blkdev_trig_led *btl; ++ int err; ++ ++ btl = kzalloc(sizeof(*btl), GFP_KERNEL); ++ if (btl == NULL) ++ return -ENOMEM; ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ goto exit_free; ++ ++ if (blkdev_trig_next_index == ULONG_MAX) { ++ err = -EOVERFLOW; ++ goto exit_unlock; ++ } ++ ++ btl->index = blkdev_trig_next_index++; ++ btl->last_checked = jiffies; ++ btl->mode = -1; /* set all bits */ ++ btl->led = led; ++ btl->blink_msec = BLKDEV_TRIG_BLINK_DEF; ++ btl->check_jiffies = msecs_to_jiffies(BLKDEV_TRIG_CHECK_DEF); ++ xa_init(&btl->linked_btbs); ++ ++ hlist_add_head(&btl->all_btls_node, &blkdev_trig_all_btls); ++ led_set_trigger_data(led, btl); ++ ++exit_unlock: ++ mutex_unlock(&blkdev_trig_mutex); ++exit_free: ++ if (err) ++ kfree(btl); ++ return err; ++} ++ ++/** ++ * blkdev_trig_deactivate() - Called by the the LEDs subsystem when an LED is ++ * disassociated from the trigger. ++ * @led: The LED ++ * ++ * The LEDs subsystem also calls this function when an LED associated with the ++ * trigger is removed or when the trigger is unregistered (if the module is ++ * unloaded). ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_deactivate(struct led_classdev *led) ++{ ++ struct blkdev_trig_led *btl = led_get_trigger_data(led); ++ struct blkdev_trig_bdev *btb; ++ unsigned long index; ++ ++ mutex_lock(&blkdev_trig_mutex); ++ ++ xa_for_each (&btl->linked_btbs, index, btb) ++ blkdev_trig_unlink_norelease(btl, btb); ++ ++ hlist_del(&btl->all_btls_node); ++ kfree(btl); ++ ++ mutex_unlock(&blkdev_trig_mutex); ++} ++ ++ ++/* ++ * ++ * Link-related attribute store functions ++ * ++ */ ++ ++/** ++ * link_dev_by_path_store() - &link_dev_by_path device attribute store function. 
++ * @dev: The LED device ++ * @attr: The &link_dev_by_path attribute (&dev_attr_link_dev_by_path) ++ * @buf: The value written to the attribute, which should be the path to ++ * a special file that represents a block device to be linked to ++ * the LED (e.g. ``/dev/sda``) ++ * @count: The number of characters in &buf ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t link_dev_by_path_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ struct blkdev_trig_bdev *btb; ++ int err; ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ return err; ++ ++ btb = blkdev_trig_get_btb(buf, count); ++ if (IS_ERR(btb)) { ++ err = PTR_ERR(btb); ++ goto exit_unlock; ++ } ++ ++ if (xa_load(&btb->linked_btls, btl->index) != NULL) { ++ err = -EEXIST; ++ goto exit_put_btb; ++ } ++ ++ err = blkdev_trig_link(btl, btb); ++ ++exit_put_btb: ++ if (err) ++ blkdev_trig_put_btb(btb); ++exit_unlock: ++ mutex_unlock(&blkdev_trig_mutex); ++ return err ? : count; ++} ++ ++/** ++ * unlink_dev_by_path_store() - &unlink_dev_by_path device attribute store ++ * function. ++ * @dev: The LED device ++ * @attr: The &unlink_dev_by_path attribute (&dev_attr_unlink_dev_by_path) ++ * @buf: The value written to the attribute, which should be the path to ++ * a special file that represents a block device to be unlinked ++ * from the LED (e.g. ``/dev/sda``) ++ * @count: The number of characters in &buf ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t unlink_dev_by_path_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ struct block_device *bdev; ++ struct blkdev_trig_bdev *btb; ++ int err; ++ ++ bdev = blkdev_trig_get_bdev(buf, count); ++ if (IS_ERR(bdev)) ++ return PTR_ERR(bdev); ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ goto exit_put_bdev; ++ ++ btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release, ++ NULL, NULL); ++ if (btb == NULL) { ++ err = -EUNATCH; /* bdev isn't linked to any LED */ ++ goto exit_unlock; ++ } ++ ++ if (xa_load(&btb->linked_btls, btl->index) == NULL) { ++ err = -EUNATCH; /* bdev isn't linked to this LED */ ++ goto exit_unlock; ++ } ++ ++ blkdev_trig_unlink_norelease(btl, btb); ++ ++exit_unlock: ++ mutex_unlock(&blkdev_trig_mutex); ++exit_put_bdev: ++ blkdev_put(bdev, NULL); ++ return err ? : count; ++} ++ ++/** ++ * unlink_dev_by_name_store() - &unlink_dev_by_name device attribute store ++ * function. ++ * @dev: The LED device ++ * @attr: The &unlink_dev_by_name attribute (&dev_attr_unlink_dev_by_name) ++ * @buf: The value written to the attribute, which should be the kernel ++ * name of a block device to be unlinked from the LED (e.g. ++ * ``sda``) ++ * @count: The number of characters in &buf ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &count on success, negative &errno on error. 
++ */ ++static ssize_t unlink_dev_by_name_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ struct blkdev_trig_bdev *btb; ++ unsigned long index; ++ int err; ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ return err; ++ ++ err = -EUNATCH; ++ ++ xa_for_each (&btl->linked_btbs, index, btb) { ++ ++ if (sysfs_streq(dev_name(&btb->bdev->bd_device), buf)) { ++ blkdev_trig_unlink_norelease(btl, btb); ++ err = 0; ++ break; ++ } ++ } ++ ++ mutex_unlock(&blkdev_trig_mutex); ++ return err ? : count; ++} ++ ++ ++/* ++ * ++ * Atomic attribute show & store functions ++ * ++ */ ++ ++/** ++ * blink_time_show() - &blink_time device attribute show function. ++ * @dev: The LED device ++ * @attr: The &blink_time attribute (&dev_attr_blink_time) ++ * @buf: Output buffer ++ * ++ * Writes the value of &blkdev_trig_led.blink_msec to &buf. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t blink_time_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ const struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ ++ return sysfs_emit(buf, "%u\n", READ_ONCE(btl->blink_msec)); ++} ++ ++/** ++ * blink_time_store() - &blink_time device attribute store function. ++ * @dev: The LED device ++ * @attr: The &blink_time attribute (&dev_attr_blink_time) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets &blkdev_trig_led.blink_msec to the value in &buf. ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t blink_time_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ unsigned int value; ++ int err; ++ ++ err = kstrtouint(buf, 0, &value); ++ if (err) ++ return err; ++ ++ if (value < BLKDEV_TRIG_BLINK_MIN || value > BLKDEV_TRIG_BLINK_MAX) ++ return -ERANGE; ++ ++ WRITE_ONCE(btl->blink_msec, value); ++ return count; ++} ++ ++/** ++ * check_interval_show() - &check_interval device attribute show function. ++ * @dev: The LED device ++ * @attr: The &check_interval attribute (&dev_attr_check_interval) ++ * @buf: Output buffer ++ * ++ * Writes the value of &blkdev_trig_led.check_jiffies (converted to ++ * milliseconds) to &buf. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t check_interval_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ ++ return sysfs_emit(buf, "%u\n", ++ jiffies_to_msecs(READ_ONCE(btl->check_jiffies))); ++} ++ ++/** ++ * check_interval_store() - &check_interval device attribute store function ++ * @dev: The LED device ++ * @attr: The &check_interval attribute (&dev_attr_check_interval) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets &blkdev_trig_led.check_jiffies to the value in &buf (after converting ++ * from milliseconds). ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. 
++ */ ++static ssize_t check_interval_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *led = led_trigger_get_drvdata(dev); ++ unsigned int value; ++ int err; ++ ++ err = kstrtouint(buf, 0, &value); ++ if (err) ++ return err; ++ ++ if (value < BLKDEV_TRIG_CHECK_MIN || value > BLKDEV_TRIG_CHECK_MAX) ++ return -ERANGE; ++ ++ WRITE_ONCE(led->check_jiffies, msecs_to_jiffies(value)); ++ ++ return count; ++} ++ ++/** ++ * blkdev_trig_mode_show() - Helper for boolean attribute show functions. ++ * @led: The LED ++ * @buf: Output buffer ++ * @bit: Which bit to show ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static int blkdev_trig_mode_show(const struct blkdev_trig_led *led, char *buf, ++ enum stat_group bit) ++{ ++ return sysfs_emit(buf, ++ READ_ONCE(led->mode) & (1 << bit) ? "Y\n" : "N\n"); ++} ++ ++/** ++ * blkdev_trig_mode_store() - Helper for boolean attribute store functions. ++ * @led: The LED ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * @bit: Which bit to set ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. ++ */ ++static int blkdev_trig_mode_store(struct blkdev_trig_led *led, ++ const char *buf, size_t count, ++ enum stat_group bit) ++{ ++ bool set; ++ int err; ++ ++ err = kstrtobool(buf, &set); ++ if (err) ++ return err; ++ ++ if (set) ++ set_bit(bit, &led->mode); ++ else ++ clear_bit(bit, &led->mode); ++ ++ return count; ++} ++ ++/** ++ * blink_on_read_show() - &blink_on_read device attribute show function. ++ * @dev: The LED device ++ * @attr: The &blink_on_read attribute (&dev_attr_blink_on_read) ++ * @buf: Output buffer ++ * ++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_READ bit in ++ * &blkdev_trig_led.mode is set or cleared. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t blink_on_read_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), ++ buf, STAT_READ); ++} ++ ++/** ++ * blink_on_read_store() - &blink_on_read device attribute store function. ++ * @dev: The LED device ++ * @attr: The &blink_on_read attribute (&dev_attr_blink_on_read) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets the &STAT_READ bit in &blkdev_trig_led.mode to the value in &buf ++ * (interpretted as a boolean). ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t blink_on_read_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), ++ buf, count, STAT_READ); ++} ++ ++/** ++ * blink_on_write_show() - &blink_on_write device attribute show function. ++ * @dev: The LED device ++ * @attr: The &blink_on_write attribute (&dev_attr_blink_on_write) ++ * @buf: Output buffer ++ * ++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_WRITE bit in ++ * in &blkdev_trig_led.mode is set or cleared. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. 
++ */ ++static ssize_t blink_on_write_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), ++ buf, STAT_WRITE); ++} ++ ++/** ++ * blink_on_write_store() - &blink_on_write device attribute store function. ++ * @dev: The LED device ++ * @attr: The &blink_on_write attribute (&dev_attr_blink_on_write) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets the &STAT_WRITE bit in &blkdev_trig_led.mode to the value in &buf ++ * (interpretted as a boolean). ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t blink_on_write_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), ++ buf, count, STAT_WRITE); ++} ++ ++/** ++ * blink_on_flush_show() - &blink_on_flush device attribute show function. ++ * @dev: The LED device ++ * @attr: The &blink_on_flush attribute (&dev_attr_blink_on_flush) ++ * @buf: Output buffer ++ * ++ * Writes ``Y`` or ``N`` to &buf, depending whether the &STAT_FLUSH bit in ++ * &blkdev_trig_led.mode is set or cleared. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t blink_on_flush_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), ++ buf, STAT_FLUSH); ++} ++ ++/** ++ * blink_on_flush_store() - &blink_on_flush device attribute store function. ++ * @dev: The LED device ++ * @attr: The &blink_on_flush attribute (&dev_attr_blink_on_flush) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets the &STAT_FLUSH bit in &blkdev_trig_led.mode to the value in &buf ++ * (interpretted as a boolean). ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t blink_on_flush_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), ++ buf, count, STAT_FLUSH); ++} ++ ++/** ++ * blink_on_discard_show() - &blink_on_discard device attribute show function. ++ * @dev: The LED device ++ * @attr: The &blink_on_discard attribute (&dev_attr_blink_on_discard) ++ * @buf: Output buffer ++ * ++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_DISCARD bit in ++ * &blkdev_trig_led.mode is set or cleared. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t blink_on_discard_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), ++ buf, STAT_DISCARD); ++} ++ ++/** ++ * blink_on_discard_store() - &blink_on_discard device attribute store function. ++ * @dev: The LED device ++ * @attr: The &blink_on_discard attribute (&dev_attr_blink_on_discard) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets the &STAT_DISCARD bit in &blkdev_trig_led.mode to the value in &buf ++ * (interpretted as a boolean). ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. 
++ */ ++static ssize_t blink_on_discard_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), ++ buf, count, STAT_DISCARD); ++} ++ ++/* Device attributes */ ++static DEVICE_ATTR_WO(link_dev_by_path); ++static DEVICE_ATTR_WO(unlink_dev_by_path); ++static DEVICE_ATTR_WO(unlink_dev_by_name); ++static DEVICE_ATTR_RW(blink_time); ++static DEVICE_ATTR_RW(check_interval); ++static DEVICE_ATTR_RW(blink_on_read); ++static DEVICE_ATTR_RW(blink_on_write); ++static DEVICE_ATTR_RW(blink_on_flush); ++static DEVICE_ATTR_RW(blink_on_discard); ++ ++/* Device attributes in LED directory (/sys/class/leds//...) */ ++static struct attribute *blkdev_trig_attrs[] = { ++ &dev_attr_link_dev_by_path.attr, ++ &dev_attr_unlink_dev_by_path.attr, ++ &dev_attr_unlink_dev_by_name.attr, ++ &dev_attr_blink_time.attr, ++ &dev_attr_check_interval.attr, ++ &dev_attr_blink_on_read.attr, ++ &dev_attr_blink_on_write.attr, ++ &dev_attr_blink_on_flush.attr, ++ &dev_attr_blink_on_discard.attr, ++ NULL ++}; ++ ++/* Unnamed attribute group == no subdirectory */ ++static const struct attribute_group blkdev_trig_attr_group = { ++ .attrs = blkdev_trig_attrs, ++}; ++ ++/* Attribute groups for the trigger */ ++static const struct attribute_group *blkdev_trig_attr_groups[] = { ++ &blkdev_trig_attr_group, /* /sys/class/leds//... */ ++ &blkdev_trig_linked_devs, /* /sys/class/leds//linked_devices/ */ ++ NULL ++}; ++ ++/* Trigger registration data */ ++static struct led_trigger blkdev_trig_trigger = { ++ .name = "blkdev", ++ .activate = blkdev_trig_activate, ++ .deactivate = blkdev_trig_deactivate, ++ .groups = blkdev_trig_attr_groups, ++}; ++ ++/** ++ * blkdev_trig_init() - Block device LED trigger initialization. ++ * ++ * Registers the ``blkdev`` LED trigger. ++ * ++ * Return: &0 on success, negative &errno on failure. ++ */ ++static int __init blkdev_trig_init(void) ++{ ++ return led_trigger_register(&blkdev_trig_trigger); ++} ++module_init(blkdev_trig_init); ++ ++/** ++ * blkdev_trig_exit() - Block device LED trigger module exit. ++ * ++ * Unregisters the ``blkdev`` LED trigger. 
++ */ ++static void __exit blkdev_trig_exit(void) ++{ ++ led_trigger_unregister(&blkdev_trig_trigger); ++} ++module_exit(blkdev_trig_exit); ++ ++MODULE_DESCRIPTION("Block device LED trigger"); ++MODULE_AUTHOR("Ian Pilcher "); ++MODULE_LICENSE("GPL v2"); +diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c +index 4dff656af3ad..74241b2ff21e 100644 +--- a/drivers/pinctrl/pinctrl-amd.c ++++ b/drivers/pinctrl/pinctrl-amd.c +@@ -748,7 +748,7 @@ static int amd_pinconf_get(struct pinctrl_dev *pctldev, + break; + + default: +- dev_err(&gpio_dev->pdev->dev, "Invalid config param %04x\n", ++ dev_dbg(&gpio_dev->pdev->dev, "Invalid config param %04x\n", + param); + return -ENOTSUPP; + } +@@ -798,7 +798,7 @@ static int amd_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, + break; + + default: +- dev_err(&gpio_dev->pdev->dev, ++ dev_dbg(&gpio_dev->pdev->dev, + "Invalid config param %04x\n", param); + ret = -ENOTSUPP; + } diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h -index e83c4c0950417..21b8dfa5d8286 100644 +index e83c4c095041..21b8dfa5d828 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -48,7 +48,7 @@ extern unsigned int pageblock_order; @@ -10280,7 +14046,7 @@ index e83c4c0950417..21b8dfa5d8286 100644 #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/kernel/padata.c b/kernel/padata.c -index 222d60195de66..b8e6b7c48746e 100644 +index 222d60195de6..b8e6b7c48746 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -45,7 +45,7 @@ struct padata_mt_job_state { @@ -10302,7 +14068,7 @@ index 222d60195de66..b8e6b7c48746e 100644 struct padata_work *pw = container_of(w, struct padata_work, pw_work); struct padata_mt_job_state *ps = pw->pw_data; diff --git a/mm/readahead.c b/mm/readahead.c -index a9c999aa19af6..797494cec4903 100644 +index a9c999aa19af..797494cec490 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -613,9 +613,17 @@ static void ondemand_readahead(struct readahead_control *ractl, @@ -10324,8 +14090,21 @@ index a9c999aa19af6..797494cec4903 100644 ra->start = start; ra->size = start - index; /* old async_size */ ra->size += req_size; +diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o +index 0edfdb40364b..ae52d3b3f063 100644 +--- a/scripts/Makefile.vmlinux_o ++++ b/scripts/Makefile.vmlinux_o +@@ -19,7 +19,7 @@ quiet_cmd_gen_initcalls_lds = GEN $@ + + .tmp_initcalls.lds: $(srctree)/scripts/generate_initcall_order.pl \ + vmlinux.a $(KBUILD_VMLINUX_LIBS) FORCE +- $(call if_changed,gen_initcalls_lds) ++ +$(call if_changed,gen_initcalls_lds) + + targets := .tmp_initcalls.lds + diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c -index ce5faa6205170..1f0f2b8df3005 100644 +index ce5faa620517..1f0f2b8df300 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -1235,7 +1235,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd @@ -10338,12 +14117,12 @@ index ce5faa6205170..1f0f2b8df3005 100644 hw_cfg->gpio1.func = CS35l41_VSPK_SWITCH; hw_cfg->gpio1.valid = true; -- -2.41.0 +2.42.0 -From 4b328fcd2f946e4a517cd7f562482a5f0c9bbe04 Mon Sep 17 00:00:00 2001 +From e4895406f7f12e8bed1293c24931803abb1915c1 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 10 Jul 2023 17:10:25 +0200 -Subject: [PATCH 5/6] ksm +Subject: [PATCH 6/7] ksm Signed-off-by: Peter Jung --- @@ -10380,7 +14159,7 @@ Signed-off-by: Peter Jung 30 files changed, 390 insertions(+), 18 deletions(-) diff --git a/Documentation/admin-guide/mm/ksm.rst 
b/Documentation/admin-guide/mm/ksm.rst -index 7626392fe82cb..5c5be7bd84b81 100644 +index 7626392fe82c..5c5be7bd84b8 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -173,6 +173,13 @@ stable_node_chains @@ -10431,7 +14210,7 @@ index 7626392fe82cb..5c5be7bd84b81 100644 From the perspective of application, a high ratio of ``ksm_rmap_items`` to ``ksm_merging_pages`` means a bad madvise-applied policy, so developers or diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl -index 1f13995d00d7b..4a5bc2a91fa74 100644 +index 1f13995d00d7..4a5bc2a91fa7 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -491,3 +491,6 @@ @@ -10442,7 +14221,7 @@ index 1f13995d00d7b..4a5bc2a91fa74 100644 +563 common process_ksm_disable sys_process_ksm_disable +564 common process_ksm_status sys_process_ksm_status diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl -index 8ebed8a138747..d616dcc060df3 100644 +index 8ebed8a13874..d616dcc060df 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -465,3 +465,6 @@ @@ -10453,7 +14232,7 @@ index 8ebed8a138747..d616dcc060df3 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h -index 64a514f90131b..63a8a9c4abc16 100644 +index 64a514f90131..63a8a9c4abc1 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -39,7 +39,7 @@ @@ -10466,7 +14245,7 @@ index 64a514f90131b..63a8a9c4abc16 100644 #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h -index d952a28463e01..c99c8260489b8 100644 +index d952a28463e0..c99c8260489b 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -909,6 +909,12 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv) @@ -10483,7 +14262,7 @@ index d952a28463e01..c99c8260489b8 100644 /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl -index f8c74ffeeefbe..735157909c6fb 100644 +index f8c74ffeeefb..735157909c6f 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -372,3 +372,6 @@ @@ -10494,7 +14273,7 @@ index f8c74ffeeefbe..735157909c6fb 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl -index 4f504783371fc..25b22d311f108 100644 +index 4f504783371f..25b22d311f10 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -451,3 +451,6 @@ @@ -10505,7 +14284,7 @@ index 4f504783371fc..25b22d311f108 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl -index 858d22bf275c2..e548c182a33ef 100644 +index 858d22bf275c..e548c182a33e 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -457,3 +457,6 @@ @@ -10516,7 +14295,7 @@ index 858d22bf275c2..e548c182a33ef 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git 
a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl -index 1976317d4e8b0..fed21167be444 100644 +index 1976317d4e8b..fed21167be44 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -390,3 +390,6 @@ @@ -10527,7 +14306,7 @@ index 1976317d4e8b0..fed21167be444 100644 +453 n32 process_ksm_disable sys_process_ksm_disable +454 n32 process_ksm_status sys_process_ksm_status diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl -index cfda2511badf3..b27ae871f676f 100644 +index cfda2511badf..b27ae871f676 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -366,3 +366,6 @@ @@ -10538,7 +14317,7 @@ index cfda2511badf3..b27ae871f676f 100644 +453 n64 process_ksm_disable sys_process_ksm_disable +454 n64 process_ksm_status sys_process_ksm_status diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl -index 7692234c37683..59f298413c292 100644 +index 7692234c3768..59f298413c29 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -439,3 +439,6 @@ @@ -10549,7 +14328,7 @@ index 7692234c37683..59f298413c292 100644 +453 o32 process_ksm_disable sys_process_ksm_disable +454 o32 process_ksm_status sys_process_ksm_status diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl -index a0a9145b6dd4f..494b59d1185fa 100644 +index a0a9145b6dd4..494b59d1185f 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -450,3 +450,6 @@ @@ -10560,7 +14339,7 @@ index a0a9145b6dd4f..494b59d1185fa 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl -index 8c0b08b7a80ec..499d7b233a431 100644 +index 8c0b08b7a80e..499d7b233a43 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -538,3 +538,6 @@ @@ -10571,7 +14350,7 @@ index 8c0b08b7a80ec..499d7b233a431 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl -index a6935af2235ca..97b36ce151556 100644 +index a6935af2235c..97b36ce15155 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -454,3 +454,6 @@ @@ -10582,7 +14361,7 @@ index a6935af2235ca..97b36ce151556 100644 +453 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status sys_process_ksm_status diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl -index 97377e8c50251..bd3827e1fc8d9 100644 +index 97377e8c5025..bd3827e1fc8d 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -454,3 +454,6 @@ @@ -10593,7 +14372,7 @@ index 97377e8c50251..bd3827e1fc8d9 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl -index faa835f3c54a5..c05e62a0ca026 100644 +index faa835f3c54a..c05e62a0ca02 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ 
b/arch/sparc/kernel/syscalls/syscall.tbl @@ -497,3 +497,6 @@ @@ -10604,7 +14383,7 @@ index faa835f3c54a5..c05e62a0ca026 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl -index bc0a3c941b35c..c79bd2dd758da 100644 +index bc0a3c941b35..c79bd2dd758d 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -456,3 +456,6 @@ @@ -10615,7 +14394,7 @@ index bc0a3c941b35c..c79bd2dd758da 100644 +453 i386 process_ksm_disable sys_process_ksm_disable +454 i386 process_ksm_status sys_process_ksm_status diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl -index 227538b0ce801..e146a70cc299f 100644 +index 227538b0ce80..e146a70cc299 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -373,6 +373,9 @@ @@ -10629,7 +14408,7 @@ index 227538b0ce801..e146a70cc299f 100644 # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl -index 2b69c3c035b6a..b7bf81a3ba133 100644 +index 2b69c3c035b6..b7bf81a3ba13 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -422,3 +422,6 @@ @@ -10640,7 +14419,7 @@ index 2b69c3c035b6a..b7bf81a3ba133 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/fs/proc/base.c b/fs/proc/base.c -index 9df3f48396628..0fedd00505771 100644 +index 9df3f4839662..0fedd0050577 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, @@ -10652,7 +14431,7 @@ index 9df3f48396628..0fedd00505771 100644 seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); diff --git a/include/linux/ksm.h b/include/linux/ksm.h -index 899a314bc4872..c2dd786a30e1f 100644 +index 899a314bc487..c2dd786a30e1 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -26,6 +26,22 @@ int ksm_disable(struct mm_struct *mm); @@ -10690,7 +14469,7 @@ index 899a314bc4872..c2dd786a30e1f 100644 static inline void collect_procs_ksm(struct page *page, struct list_head *to_kill, int force_early) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 5e74ce4a28cd6..51d04c1847c11 100644 +index 7d30dc4ff0ff..d8d8cc1348d6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -812,7 +812,7 @@ struct mm_struct { @@ -10717,7 +14496,7 @@ index 5e74ce4a28cd6..51d04c1847c11 100644 struct { /* this mm_struct is on lru_gen_mm_list */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h -index 03e3d0121d5e3..16597dea90f40 100644 +index 03e3d0121d5e..16597dea90f4 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -813,6 +813,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); @@ -10731,7 +14510,7 @@ index 03e3d0121d5e3..16597dea90f40 100644 unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h -index fd6c1cb585db4..11d0fc82c4378 100644 +index fd6c1cb585db..11d0fc82c437 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -820,8 +820,17 @@ __SYSCALL(__NR_set_mempolicy_home_node, 
sys_set_mempolicy_home_node) @@ -10754,7 +14533,7 @@ index fd6c1cb585db4..11d0fc82c4378 100644 /* * 32 bit systems traditionally used different diff --git a/kernel/sys.c b/kernel/sys.c -index 2410e3999ebe5..b0841a2dd2b7a 100644 +index 2410e3999ebe..b0841a2dd2b7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2727,6 +2727,153 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, @@ -10912,7 +14691,7 @@ index 2410e3999ebe5..b0841a2dd2b7a 100644 struct getcpu_cache __user *, unused) { diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c -index 781de7cc6a4e1..49a35d35d0f97 100644 +index 781de7cc6a4e..49a35d35d0f9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -184,6 +184,9 @@ COND_SYSCALL(mincore); @@ -10926,7 +14705,7 @@ index 781de7cc6a4e1..49a35d35d0f97 100644 COND_SYSCALL(mbind); COND_SYSCALL(get_mempolicy); diff --git a/mm/khugepaged.c b/mm/khugepaged.c -index 78c8d5d8b6284..4b8b8673d5d9f 100644 +index 78c8d5d8b628..4b8b8673d5d9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -19,6 +19,7 @@ @@ -10946,7 +14725,7 @@ index 78c8d5d8b6284..4b8b8673d5d9f 100644 } else { src_page = pte_page(pteval); diff --git a/mm/ksm.c b/mm/ksm.c -index d20d7662419be..74804158ee02d 100644 +index d7b5b95e936e..6b7b8928fb96 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -278,6 +278,9 @@ static unsigned int zero_checksum __read_mostly; @@ -10969,7 +14748,7 @@ index d20d7662419be..74804158ee02d 100644 pte_unmap_unlock(pte, ptl); return ret; } -@@ -1222,8 +1226,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, +@@ -1229,8 +1233,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, page_add_anon_rmap(kpage, vma, addr, RMAP_NONE); newpte = mk_pte(kpage, vma->vm_page_prot); } else { @@ -10986,7 +14765,7 @@ index d20d7662419be..74804158ee02d 100644 /* * We're replacing an anonymous page with a zero page, which is * not anonymous. 
We need to do proper accounting otherwise we -@@ -3084,7 +3094,7 @@ static void wait_while_offlining(void) +@@ -3091,7 +3101,7 @@ static void wait_while_offlining(void) #ifdef CONFIG_PROC_FS long ksm_process_profit(struct mm_struct *mm) { @@ -10995,7 +14774,7 @@ index d20d7662419be..74804158ee02d 100644 mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ -@@ -3353,12 +3363,19 @@ static ssize_t pages_volatile_show(struct kobject *kobj, +@@ -3360,12 +3370,19 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); @@ -11016,7 +14795,7 @@ index d20d7662419be..74804158ee02d 100644 ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); -@@ -3420,6 +3437,7 @@ static struct attribute *ksm_attrs[] = { +@@ -3427,6 +3444,7 @@ static struct attribute *ksm_attrs[] = { &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, @@ -11025,7 +14804,7 @@ index d20d7662419be..74804158ee02d 100644 #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, diff --git a/mm/memory.c b/mm/memory.c -index 1ec1ef3418bf5..014dd58b3ffe9 100644 +index cdc4d4c1c858..428943ecda25 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1433,8 +1433,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -11049,7 +14828,7 @@ index 1ec1ef3418bf5..014dd58b3ffe9 100644 } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c -index 26853badae705..0de9d33cd565d 100644 +index 26853badae70..0de9d33cd565 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -29,6 +29,8 @@ @@ -11188,12 +14967,12 @@ index 26853badae705..0de9d33cd565d 100644 #ifdef __NR_userfaultfd test_unmerge_uffd_wp(); -- -2.41.0 +2.42.0 -From e376a8aadd07d72875ff77bfc6c3d2ba9ac549bd Mon Sep 17 00:00:00 2001 +From 49274c8196e04f14f8af83a59ff82e2ae00ac21b Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 10 Jul 2023 17:11:55 +0200 -Subject: [PATCH 6/6] zstd +Subject: [PATCH 7/7] zstd Signed-off-by: Peter Jung --- @@ -11260,7 +15039,7 @@ Signed-off-by: Peter Jung create mode 100644 lib/zstd/common/bits.h diff --git a/include/linux/zstd.h b/include/linux/zstd.h -index 113408eef6ece..f109d49f43f80 100644 +index 113408eef6ec..f109d49f43f8 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -1,6 +1,6 @@ @@ -11272,7 +15051,7 @@ index 113408eef6ece..f109d49f43f80 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h -index 58b6dd45a969f..6d5cf55f0bf3e 100644 +index 58b6dd45a969..6d5cf55f0bf3 100644 --- a/include/linux/zstd_errors.h +++ b/include/linux/zstd_errors.h @@ -1,5 +1,6 @@ @@ -11338,7 +15117,7 @@ index 58b6dd45a969f..6d5cf55f0bf3e 100644 } ZSTD_ErrorCode; diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h -index 79d55465d5c1d..8b4ffe649df57 100644 +index 79d55465d5c1..8b4ffe649df5 100644 --- a/include/linux/zstd_lib.h +++ b/include/linux/zstd_lib.h @@ -1,5 +1,6 @@ @@ -12530,7 +16309,7 @@ index 79d55465d5c1d..8b4ffe649df57 100644 #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile -index 20f08c644b71a..464c410b2768c 100644 +index 20f08c644b71..464c410b2768 100644 --- a/lib/zstd/Makefile +++ b/lib/zstd/Makefile @@ -1,6 +1,6 @@ @@ -12543,7 +16322,7 @@ index 20f08c644b71a..464c410b2768c 
100644 # This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h new file mode 100644 -index 0000000000000..05adbbeccaa9b +index 000000000000..05adbbeccaa9 --- /dev/null +++ b/lib/zstd/common/allocations.h @@ -0,0 +1,56 @@ @@ -12605,7 +16384,7 @@ index 0000000000000..05adbbeccaa9b +#endif /* ZSTD_ALLOCATIONS_H */ diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h new file mode 100644 -index 0000000000000..aa3487ec4b6a7 +index 000000000000..aa3487ec4b6a --- /dev/null +++ b/lib/zstd/common/bits.h @@ -0,0 +1,149 @@ @@ -12759,7 +16538,7 @@ index 0000000000000..aa3487ec4b6a7 + +#endif /* ZSTD_BITS_H */ diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h -index feef3a1b1d600..444dc4f85c649 100644 +index feef3a1b1d60..444dc4f85c64 100644 --- a/lib/zstd/common/bitstream.h +++ b/lib/zstd/common/bitstream.h @@ -1,7 +1,8 @@ @@ -12886,7 +16665,7 @@ index feef3a1b1d600..444dc4f85c649 100644 if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ return BIT_DStream_overflow; diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h -index c42d39faf9bd8..c437e09755750 100644 +index c42d39faf9bd..c437e0975575 100644 --- a/lib/zstd/common/compiler.h +++ b/lib/zstd/common/compiler.h @@ -1,5 +1,6 @@ @@ -12916,7 +16695,7 @@ index c42d39faf9bd8..c437e09755750 100644 #endif /* ZSTD_COMPILER_H */ diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h -index 0db7b42407eea..d8319a2bef4ce 100644 +index 0db7b42407ee..d8319a2bef4c 100644 --- a/lib/zstd/common/cpu.h +++ b/lib/zstd/common/cpu.h @@ -1,5 +1,6 @@ @@ -12928,7 +16707,7 @@ index 0db7b42407eea..d8319a2bef4ce 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c -index bb863c9ea6164..e56ff6464e918 100644 +index bb863c9ea616..e56ff6464e91 100644 --- a/lib/zstd/common/debug.c +++ b/lib/zstd/common/debug.c @@ -1,7 +1,8 @@ @@ -12942,7 +16721,7 @@ index bb863c9ea6164..e56ff6464e918 100644 * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h -index 6dd88d1fbd02c..da0dbfc614b88 100644 +index 6dd88d1fbd02..da0dbfc614b8 100644 --- a/lib/zstd/common/debug.h +++ b/lib/zstd/common/debug.h @@ -1,7 +1,8 @@ @@ -12956,7 +16735,7 @@ index 6dd88d1fbd02c..da0dbfc614b88 100644 * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c -index fef67056f0524..6cdd82233fb59 100644 +index fef67056f052..6cdd82233fb5 100644 --- a/lib/zstd/common/entropy_common.c +++ b/lib/zstd/common/entropy_common.c @@ -1,6 +1,7 @@ @@ -13074,7 +16853,7 @@ index fef67056f0524..6cdd82233fb59 100644 return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); } diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c -index 6d1135f8c3733..a4062d30d1703 100644 +index 6d1135f8c373..a4062d30d170 100644 --- a/lib/zstd/common/error_private.c +++ b/lib/zstd/common/error_private.c @@ -1,5 +1,6 @@ @@ -13122,7 +16901,7 @@ index 6d1135f8c3733..a4062d30d1703 100644 default: return notErrorCode; } diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h -index ca5101e542faa..9a4699a38a881 100644 
+index ca5101e542fa..9a4699a38a88 100644 --- a/lib/zstd/common/error_private.h +++ b/lib/zstd/common/error_private.h @@ -1,5 +1,6 @@ @@ -13134,7 +16913,7 @@ index ca5101e542faa..9a4699a38a881 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h -index 4507043b2287c..c4e25a2191429 100644 +index 4507043b2287..c4e25a219142 100644 --- a/lib/zstd/common/fse.h +++ b/lib/zstd/common/fse.h @@ -1,7 +1,8 @@ @@ -13286,7 +17065,7 @@ index 4507043b2287c..c4e25a2191429 100644 * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c -index a0d06095be83d..45cf457f31ef8 100644 +index a0d06095be83..45cf457f31ef 100644 --- a/lib/zstd/common/fse_decompress.c +++ b/lib/zstd/common/fse_decompress.c @@ -1,6 +1,7 @@ @@ -13446,7 +17225,7 @@ index a0d06095be83d..45cf457f31ef8 100644 - #endif /* FSE_COMMONDEFS_ONLY */ diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h -index 5042ff8703087..8e7943092ed1a 100644 +index 5042ff870308..8e7943092ed1 100644 --- a/lib/zstd/common/huf.h +++ b/lib/zstd/common/huf.h @@ -1,7 +1,8 @@ @@ -13773,7 +17552,7 @@ index 5042ff8703087..8e7943092ed1a 100644 +#endif /* HUF_H_298734234 */ diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h -index 1d9cc03924ca9..a7231822b6e32 100644 +index 1d9cc03924ca..a7231822b6e3 100644 --- a/lib/zstd/common/mem.h +++ b/lib/zstd/common/mem.h @@ -1,6 +1,6 @@ @@ -13785,7 +17564,7 @@ index 1d9cc03924ca9..a7231822b6e32 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h -index 0e3b2c0a527db..7ede8cf1ffe57 100644 +index 0e3b2c0a527d..7ede8cf1ffe5 100644 --- a/lib/zstd/common/portability_macros.h +++ b/lib/zstd/common/portability_macros.h @@ -1,5 +1,6 @@ @@ -13839,7 +17618,7 @@ index 0e3b2c0a527db..7ede8cf1ffe57 100644 + #endif /* ZSTD_PORTABILITY_MACROS_H */ diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c -index 3d7e35b309b5d..44b95b25344a1 100644 +index 3d7e35b309b5..44b95b25344a 100644 --- a/lib/zstd/common/zstd_common.c +++ b/lib/zstd/common/zstd_common.c @@ -1,5 +1,6 @@ @@ -13897,7 +17676,7 @@ index 3d7e35b309b5d..44b95b25344a1 100644 - } -} diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h -index 2c34e8a33a1c1..670c5fa2a952d 100644 +index 2c34e8a33a1c..670c5fa2a952 100644 --- a/lib/zstd/common/zstd_deps.h +++ b/lib/zstd/common/zstd_deps.h @@ -1,6 +1,6 @@ @@ -13931,7 +17710,7 @@ index 2c34e8a33a1c1..670c5fa2a952d 100644 +#endif /* ZSTD_DEPS_STDINT */ +#endif /* ZSTD_DEPS_NEED_STDINT */ diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h -index 93305d9b41bba..7f023e4d47740 100644 +index 93305d9b41bb..7f023e4d4774 100644 --- a/lib/zstd/common/zstd_internal.h +++ b/lib/zstd/common/zstd_internal.h @@ -1,5 +1,6 @@ @@ -14117,7 +17896,7 @@ index 93305d9b41bba..7f023e4d47740 100644 /* ZSTD_invalidateRepCodes() : diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h -index d9a76112ec3af..6ab8be6532efc 100644 +index d9a76112ec3a..6ab8be6532ef 100644 --- a/lib/zstd/compress/clevels.h +++ b/lib/zstd/compress/clevels.h @@ -1,5 +1,6 @@ @@ -14129,7 +17908,7 @@ index d9a76112ec3af..6ab8be6532efc 100644 * * This source code is licensed under both the BSD-style license (found in the 
diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c -index ec5b1ca6d71af..e46ca6621b488 100644 +index ec5b1ca6d71a..e46ca6621b48 100644 --- a/lib/zstd/compress/fse_compress.c +++ b/lib/zstd/compress/fse_compress.c @@ -1,6 +1,7 @@ @@ -14248,7 +18027,7 @@ index ec5b1ca6d71af..e46ca6621b488 100644 - #endif /* FSE_COMMONDEFS_ONLY */ diff --git a/lib/zstd/compress/hist.c b/lib/zstd/compress/hist.c -index 3ddc6dfb68948..0b12587cc14b1 100644 +index 3ddc6dfb6894..0b12587cc14b 100644 --- a/lib/zstd/compress/hist.c +++ b/lib/zstd/compress/hist.c @@ -1,7 +1,8 @@ @@ -14262,7 +18041,7 @@ index 3ddc6dfb68948..0b12587cc14b1 100644 * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/compress/hist.h b/lib/zstd/compress/hist.h -index fc1830abc9c63..f7687b0fc20a0 100644 +index fc1830abc9c6..f7687b0fc20a 100644 --- a/lib/zstd/compress/hist.h +++ b/lib/zstd/compress/hist.h @@ -1,7 +1,8 @@ @@ -14276,7 +18055,7 @@ index fc1830abc9c63..f7687b0fc20a0 100644 * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compress.c -index 74ef0db476210..83241abafe35e 100644 +index 74ef0db47621..83241abafe35 100644 --- a/lib/zstd/compress/huf_compress.c +++ b/lib/zstd/compress/huf_compress.c @@ -1,6 +1,7 @@ @@ -15033,7 +18812,7 @@ index 74ef0db476210..83241abafe35e 100644 } - diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c -index f620cafca633b..c1c316e9e289f 100644 +index f620cafca633..c1c316e9e289 100644 --- a/lib/zstd/compress/zstd_compress.c +++ b/lib/zstd/compress/zstd_compress.c @@ -1,5 +1,6 @@ @@ -18415,7 +22194,7 @@ index f620cafca633b..c1c316e9e289f 100644 + } +} diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h -index 71697a11ae305..899f5e2de8e96 100644 +index 71697a11ae30..899f5e2de8e9 100644 --- a/lib/zstd/compress/zstd_compress_internal.h +++ b/lib/zstd/compress/zstd_compress_internal.h @@ -1,5 +1,6 @@ @@ -18969,7 +22748,7 @@ index 71697a11ae305..899f5e2de8e96 100644 + #endif /* ZSTD_COMPRESS_H */ diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c -index 52b0a8059aba9..3e9ea46a670a6 100644 +index 52b0a8059aba..3e9ea46a670a 100644 --- a/lib/zstd/compress/zstd_compress_literals.c +++ b/lib/zstd/compress/zstd_compress_literals.c @@ -1,5 +1,6 @@ @@ -19211,7 +22990,7 @@ index 52b0a8059aba9..3e9ea46a670a6 100644 MEM_writeLE32(ostart, lhc); ostart[4] = (BYTE)(cLitSize >> 10); diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h -index 9775fb97cb702..a2a85d6b69e53 100644 +index 9775fb97cb70..a2a85d6b69e5 100644 --- a/lib/zstd/compress/zstd_compress_literals.h +++ b/lib/zstd/compress/zstd_compress_literals.h @@ -1,5 +1,6 @@ @@ -19255,7 +23034,7 @@ index 9775fb97cb702..a2a85d6b69e53 100644 #endif /* ZSTD_COMPRESS_LITERALS_H */ diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c -index 21ddc1b37acf8..5c028c78d889b 100644 +index 21ddc1b37acf..5c028c78d889 100644 --- a/lib/zstd/compress/zstd_compress_sequences.c +++ b/lib/zstd/compress/zstd_compress_sequences.c @@ -1,5 +1,6 @@ @@ -19285,7 +23064,7 @@ index 21ddc1b37acf8..5c028c78d889b 100644 * If basic encoding isn't possible, always choose RLE. 
*/ diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h -index 7991364c2f71f..7fe6f4ff5cf25 100644 +index 7991364c2f71..7fe6f4ff5cf2 100644 --- a/lib/zstd/compress/zstd_compress_sequences.h +++ b/lib/zstd/compress/zstd_compress_sequences.h @@ -1,5 +1,6 @@ @@ -19297,7 +23076,7 @@ index 7991364c2f71f..7fe6f4ff5cf25 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c -index 17d836cc84e8f..dbacbaf727338 100644 +index 17d836cc84e8..dbacbaf72733 100644 --- a/lib/zstd/compress/zstd_compress_superblock.c +++ b/lib/zstd/compress/zstd_compress_superblock.c @@ -1,5 +1,6 @@ @@ -19396,7 +23175,7 @@ index 17d836cc84e8f..dbacbaf727338 100644 ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); } diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h -index 224ece79546eb..826bbc9e029b1 100644 +index 224ece79546e..826bbc9e029b 100644 --- a/lib/zstd/compress/zstd_compress_superblock.h +++ b/lib/zstd/compress/zstd_compress_superblock.h @@ -1,5 +1,6 @@ @@ -19408,7 +23187,7 @@ index 224ece79546eb..826bbc9e029b1 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h -index 349fc923c355a..65ea53b628447 100644 +index 349fc923c355..65ea53b62844 100644 --- a/lib/zstd/compress/zstd_cwksp.h +++ b/lib/zstd/compress/zstd_cwksp.h @@ -1,5 +1,6 @@ @@ -19713,7 +23492,7 @@ index 349fc923c355a..65ea53b628447 100644 diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c -index 76933dea2624e..ab9440a996039 100644 +index 76933dea2624..ab9440a99603 100644 --- a/lib/zstd/compress/zstd_double_fast.c +++ b/lib/zstd/compress/zstd_double_fast.c @@ -1,5 +1,6 @@ @@ -20026,7 +23805,7 @@ index 76933dea2624e..ab9440a996039 100644 hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; ip += repLength2; diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h -index 6822bde65a1d8..0204f12e4cf70 100644 +index 6822bde65a1d..0204f12e4cf7 100644 --- a/lib/zstd/compress/zstd_double_fast.h +++ b/lib/zstd/compress/zstd_double_fast.h @@ -1,5 +1,6 @@ @@ -20048,7 +23827,7 @@ index 6822bde65a1d8..0204f12e4cf70 100644 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c -index a752e6beab52e..3399b39c5dbc5 100644 +index a752e6beab52..3399b39c5dbc 100644 --- a/lib/zstd/compress/zstd_fast.c +++ b/lib/zstd/compress/zstd_fast.c @@ -1,5 +1,6 @@ @@ -20831,7 +24610,7 @@ index a752e6beab52e..3399b39c5dbc5 100644 { default: /* includes case 3 */ diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h -index fddc2f532d21d..e64d9e1b2d393 100644 +index fddc2f532d21..e64d9e1b2d39 100644 --- a/lib/zstd/compress/zstd_fast.h +++ b/lib/zstd/compress/zstd_fast.h @@ -1,5 +1,6 @@ @@ -20853,7 +24632,7 @@ index fddc2f532d21d..e64d9e1b2d393 100644 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c -index 0298a01a7504a..f6b4978ceba7f 100644 +index 0298a01a7504..f6b4978ceba7 100644 --- a/lib/zstd/compress/zstd_lazy.c +++ b/lib/zstd/compress/zstd_lazy.c @@ -1,5 +1,6 @@ @@ -21916,7 +25695,7 @@ index 
0298a01a7504a..f6b4978ceba7f 100644 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); } diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h -index e5bdf4df8dde0..9505bed93c031 100644 +index e5bdf4df8dde..9505bed93c03 100644 --- a/lib/zstd/compress/zstd_lazy.h +++ b/lib/zstd/compress/zstd_lazy.h @@ -1,5 +1,6 @@ @@ -21946,7 +25725,7 @@ index e5bdf4df8dde0..9505bed93c031 100644 #endif /* ZSTD_LAZY_H */ diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c -index dd86fc83e7dde..b7da76b0db7c4 100644 +index dd86fc83e7dd..b7da76b0db7c 100644 --- a/lib/zstd/compress/zstd_ldm.c +++ b/lib/zstd/compress/zstd_ldm.c @@ -1,5 +1,6 @@ @@ -21990,7 +25769,7 @@ index dd86fc83e7dde..b7da76b0db7c4 100644 ip += sequence.matchLength; } diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h -index fbc6a5e88fd7a..c540731abde72 100644 +index fbc6a5e88fd7..c540731abde7 100644 --- a/lib/zstd/compress/zstd_ldm.h +++ b/lib/zstd/compress/zstd_ldm.h @@ -1,5 +1,6 @@ @@ -22002,7 +25781,7 @@ index fbc6a5e88fd7a..c540731abde72 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h -index 647f865be2903..cfccfc46f6f7b 100644 +index 647f865be290..cfccfc46f6f7 100644 --- a/lib/zstd/compress/zstd_ldm_geartab.h +++ b/lib/zstd/compress/zstd_ldm_geartab.h @@ -1,5 +1,6 @@ @@ -22014,7 +25793,7 @@ index 647f865be2903..cfccfc46f6f7b 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c -index fd82acfda62f6..1e41cb04f4820 100644 +index fd82acfda62f..1e41cb04f482 100644 --- a/lib/zstd/compress/zstd_opt.c +++ b/lib/zstd/compress/zstd_opt.c @@ -1,5 +1,6 @@ @@ -22496,7 +26275,7 @@ index fd82acfda62f6..1e41cb04f4820 100644 ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); } diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h -index 22b862858ba7a..faa73ff4b03dc 100644 +index 22b862858ba7..faa73ff4b03d 100644 --- a/lib/zstd/compress/zstd_opt.h +++ b/lib/zstd/compress/zstd_opt.h @@ -1,5 +1,6 @@ @@ -22508,7 +26287,7 @@ index 22b862858ba7a..faa73ff4b03dc 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c -index 60958afebc415..d172e35fbd9a6 100644 +index 60958afebc41..d172e35fbd9a 100644 --- a/lib/zstd/decompress/huf_decompress.c +++ b/lib/zstd/decompress/huf_decompress.c @@ -1,7 +1,8 @@ @@ -23699,7 +27478,7 @@ index 60958afebc415..d172e35fbd9a6 100644 } - diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c -index dbbc7919de534..30ef65e1ab5ca 100644 +index dbbc7919de53..30ef65e1ab5c 100644 --- a/lib/zstd/decompress/zstd_ddict.c +++ b/lib/zstd/decompress/zstd_ddict.c @@ -1,5 +1,6 @@ @@ -23741,7 +27520,7 @@ index dbbc7919de534..30ef65e1ab5ca 100644 + return ddict->dictID; } diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h -index 8c1a79d666f89..de459a0dacd19 100644 +index 8c1a79d666f8..de459a0dacd1 100644 --- a/lib/zstd/decompress/zstd_ddict.h +++ b/lib/zstd/decompress/zstd_ddict.h @@ -1,5 +1,6 @@ @@ -23753,7 +27532,7 @@ index 8c1a79d666f89..de459a0dacd19 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/decompress/zstd_decompress.c 
b/lib/zstd/decompress/zstd_decompress.c -index 6b3177c947114..03dbdf39109f9 100644 +index 6b3177c94711..03dbdf39109f 100644 --- a/lib/zstd/decompress/zstd_decompress.c +++ b/lib/zstd/decompress/zstd_decompress.c @@ -1,5 +1,6 @@ @@ -24310,7 +28089,7 @@ index 6b3177c947114..03dbdf39109f9 100644 + } } diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c -index c1913b8e7c897..9f5577e5bc19d 100644 +index c1913b8e7c89..9f5577e5bc19 100644 --- a/lib/zstd/decompress/zstd_decompress_block.c +++ b/lib/zstd/decompress/zstd_decompress_block.c @@ -1,5 +1,6 @@ @@ -24847,7 +28626,7 @@ index c1913b8e7c897..9f5577e5bc19d 100644 + return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); +} diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h -index 3d2d57a5d25a7..5888e6cc788b5 100644 +index 3d2d57a5d25a..5888e6cc788b 100644 --- a/lib/zstd/decompress/zstd_decompress_block.h +++ b/lib/zstd/decompress/zstd_decompress_block.h @@ -1,5 +1,6 @@ @@ -24870,7 +28649,7 @@ index 3d2d57a5d25a7..5888e6cc788b5 100644 #endif /* ZSTD_DEC_BLOCK_H */ diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h -index 98102edb6a832..32f79fb2873df 100644 +index 98102edb6a83..32f79fb2873d 100644 --- a/lib/zstd/decompress/zstd_decompress_internal.h +++ b/lib/zstd/decompress/zstd_decompress_internal.h @@ -1,5 +1,6 @@ @@ -24905,7 +28684,7 @@ index 98102edb6a832..32f79fb2873df 100644 /* streaming */ ZSTD_dStreamStage streamStage; diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h -index a06ca187aab5f..8a47eb2a45145 100644 +index a06ca187aab5..8a47eb2a4514 100644 --- a/lib/zstd/decompress_sources.h +++ b/lib/zstd/decompress_sources.h @@ -1,6 +1,6 @@ @@ -24917,7 +28696,7 @@ index a06ca187aab5f..8a47eb2a45145 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c -index 22686e367e6f0..466828e357525 100644 +index 22686e367e6f..466828e35752 100644 --- a/lib/zstd/zstd_common_module.c +++ b/lib/zstd/zstd_common_module.c @@ -1,6 +1,6 @@ @@ -24939,7 +28718,7 @@ index 22686e367e6f0..466828e357525 100644 MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("Zstd Common"); diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c -index 04e1b5c01d9b6..8ecf43226af2f 100644 +index 04e1b5c01d9b..8ecf43226af2 100644 --- a/lib/zstd/zstd_compress_module.c +++ b/lib/zstd/zstd_compress_module.c @@ -1,6 +1,6 @@ @@ -24951,7 +28730,7 @@ index 04e1b5c01d9b6..8ecf43226af2f 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c -index f4ed952ed4852..eb1c49e69722f 100644 +index f4ed952ed485..eb1c49e69722 100644 --- a/lib/zstd/zstd_decompress_module.c +++ b/lib/zstd/zstd_decompress_module.c @@ -1,6 +1,6 @@ @@ -24963,4 +28742,4 @@ index f4ed952ed4852..eb1c49e69722f 100644 * * This source code is licensed under both the BSD-style license (found in the -- -2.41.0 +2.42.0 diff --git a/patches/0002-eevdf.patch b/patches/0002-eevdf.patch index bcda337..710de28 100644 --- a/patches/0002-eevdf.patch +++ b/patches/0002-eevdf.patch @@ -1,26 +1,27 @@ -From 6d15f875cb0c7fd65fc422c0545d57fc2e124f7c Mon Sep 17 00:00:00 2001 +From 9a3788351b1bc830a28d7a51740d2ee964ab8319 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 20 Aug 2023 15:56:13 
+0200 -Subject: [PATCH] EEVDF-cachy +Date: Mon, 28 Aug 2023 14:04:00 +0200 +Subject: [PATCH] EEVDF Signed-off-by: Peter Jung --- - Documentation/admin-guide/cgroup-v2.rst | 10 + - include/linux/rbtree_augmented.h | 26 + - include/linux/sched.h | 8 +- - include/uapi/linux/sched.h | 4 +- - include/uapi/linux/sched/types.h | 19 + - init/init_task.c | 3 +- - kernel/sched/core.c | 65 +- - kernel/sched/debug.c | 49 +- - kernel/sched/fair.c | 1150 +++++++++++------------ - kernel/sched/features.h | 24 +- - kernel/sched/sched.h | 21 +- - tools/include/uapi/linux/sched.h | 4 +- - 12 files changed, 715 insertions(+), 668 deletions(-) + Documentation/admin-guide/cgroup-v2.rst | 10 + + Documentation/scheduler/sched-design-CFS.rst | 2 +- + include/linux/rbtree_augmented.h | 26 + + include/linux/sched.h | 8 +- + include/uapi/linux/sched.h | 4 +- + include/uapi/linux/sched/types.h | 19 + + init/init_task.c | 3 +- + kernel/sched/core.c | 65 +- + kernel/sched/debug.c | 49 +- + kernel/sched/fair.c | 1150 ++++++++---------- + kernel/sched/features.h | 24 +- + kernel/sched/sched.h | 21 +- + tools/include/uapi/linux/sched.h | 4 +- + 13 files changed, 716 insertions(+), 669 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst -index 4ef8901911961..3a8d3e1e55910 100644 +index 4ef890191196..3a8d3e1e5591 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1121,6 +1121,16 @@ All time durations are in microseconds. @@ -40,8 +41,21 @@ index 4ef8901911961..3a8d3e1e55910 100644 Memory +diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst +index 03db55504515..f68919800f05 100644 +--- a/Documentation/scheduler/sched-design-CFS.rst ++++ b/Documentation/scheduler/sched-design-CFS.rst +@@ -94,7 +94,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the + way the previous scheduler had, and has no heuristics whatsoever. There is + only one central tunable (you have to switch on CONFIG_SCHED_DEBUG): + +- /sys/kernel/debug/sched/min_granularity_ns ++ /sys/kernel/debug/sched/base_slice_ns + + which can be used to tune the scheduler from "desktop" (i.e., low latencies) to + "server" (i.e., good batching) workloads. 
It defaults to a setting suitable diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h -index 7ee7ed5de7227..6dbc5a1bf6a8c 100644 +index 7ee7ed5de722..6dbc5a1bf6a8 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node, @@ -78,7 +92,7 @@ index 7ee7ed5de7227..6dbc5a1bf6a8c 100644 * Template for declaring augmented rbtree callbacks (generic case) * diff --git a/include/linux/sched.h b/include/linux/sched.h -index 609bde814cb06..c940c4dc83048 100644 +index 609bde814cb0..c940c4dc8304 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -549,13 +549,18 @@ struct sched_entity { @@ -110,7 +124,7 @@ index 609bde814cb06..c940c4dc83048 100644 struct sched_entity se; struct sched_rt_entity rt; diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ceab26..b2e932c25be62 100644 +index 3bac0a8ceab2..b2e932c25be6 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -132,6 +132,7 @@ struct clone_args { @@ -131,7 +145,7 @@ index 3bac0a8ceab26..b2e932c25be62 100644 #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h -index f2c4589d4dbfe..db1e8199e8c80 100644 +index f2c4589d4dbf..db1e8199e8c8 100644 --- a/include/uapi/linux/sched/types.h +++ b/include/uapi/linux/sched/types.h @@ -10,6 +10,7 @@ struct sched_param { @@ -175,7 +189,7 @@ index f2c4589d4dbfe..db1e8199e8c80 100644 #endif /* _UAPI_LINUX_SCHED_TYPES_H */ diff --git a/init/init_task.c b/init/init_task.c -index ff6c4b9bfe6b1..511cbcf3510dc 100644 +index ff6c4b9bfe6b..511cbcf3510d 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -78,6 +78,7 @@ struct task_struct init_task @@ -196,7 +210,7 @@ index ff6c4b9bfe6b1..511cbcf3510dc 100644 .rt = { .run_list = LIST_HEAD_INIT(init_task.rt.run_list), diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index c52c2eba7c739..aff81e12460ed 100644 +index c52c2eba7c73..aff81e12460e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load) @@ -358,7 +372,7 @@ index c52c2eba7c739..aff81e12460ed 100644 #ifdef CONFIG_CFS_BANDWIDTH { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 066ff1c8ae4eb..e7e83181fbb6c 100644 +index 066ff1c8ae4e..e7e83181fbb6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -347,10 +347,7 @@ static __init int sched_init_debug(void) @@ -462,7 +476,7 @@ index 066ff1c8ae4eb..e7e83181fbb6c 100644 P(dl.runtime); P(dl.deadline); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 2c335df301718..e0a4c13dab04f 100644 +index 2c335df30171..e0a4c13dab04 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -47,6 +47,7 @@ @@ -2075,7 +2089,7 @@ index 2c335df301718..e0a4c13dab04f 100644 return rr_interval; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index ee7f23c76bd33..546d212ef40d8 100644 +index ee7f23c76bd3..546d212ef40d 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -1,16 +1,12 @@ @@ -2122,7 +2136,7 @@ index ee7f23c76bd33..546d212ef40d8 100644 -SCHED_FEAT(ALT_PERIOD, true) -SCHED_FEAT(BASE_SLICE, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index e93e006a942b9..67cd7e1fd5016 100644 +index e93e006a942b..67cd7e1fd501 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -372,6 +372,8 @@ struct task_group { @@ -2202,7 +2216,7 @@ index 
e93e006a942b9..67cd7e1fd5016 100644 + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h -index 3bac0a8ceab26..b2e932c25be62 100644 +index 3bac0a8ceab2..b2e932c25be6 100644 --- a/tools/include/uapi/linux/sched.h +++ b/tools/include/uapi/linux/sched.h @@ -132,6 +132,7 @@ struct clone_args { @@ -2223,4 +2237,4 @@ index 3bac0a8ceab26..b2e932c25be62 100644 #endif /* _UAPI_LINUX_SCHED_H */ -- -2.41.0 +2.42.0 diff --git a/patches/0003-bcachefs.patch b/patches/0003-bcachefs.patch index d7ad7bd..5bee813 100644 --- a/patches/0003-bcachefs.patch +++ b/patches/0003-bcachefs.patch @@ -1,23 +1,19 @@ -From 5f9d0663e5c9895cfa7238b3456e2a268daf5878 Mon Sep 17 00:00:00 2001 +From 31f38fa87a86e086ffcc063e7e24702064eda50f Mon Sep 17 00:00:00 2001 From: Piotr Gorski -Date: Fri, 21 Jul 2023 08:07:37 +0200 +Date: Tue, 29 Aug 2023 12:14:18 +0200 Subject: [PATCH] bcachefs Signed-off-by: Piotr Gorski --- - Documentation/admin-guide/sysctl/vm.rst | 16 + - Documentation/filesystems/proc.rst | 28 + - MAINTAINERS | 56 + - arch/arm64/include/asm/spectre.h | 4 +- + MAINTAINERS | 32 + arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- - arch/x86/kernel/amd_gart_64.c | 2 +- - block/bdev.c | 2 +- block/bio.c | 18 +- block/blk-core.c | 1 + block/blk.h | 1 - + drivers/accel/ivpu/ivpu_gem.c | 8 +- + drivers/accel/ivpu/ivpu_gem.h | 2 +- drivers/block/virtio_blk.c | 4 +- drivers/gpu/drm/gud/gud_drv.c | 2 +- - drivers/iommu/dma-iommu.c | 2 +- drivers/md/bcache/Kconfig | 10 +- drivers/md/bcache/Makefile | 4 +- drivers/md/bcache/bcache.h | 2 +- @@ -27,118 +23,125 @@ Signed-off-by: Piotr Gorski drivers/mtd/spi-nor/debugfs.c | 6 +- .../ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 4 +- drivers/scsi/sd.c | 8 +- - drivers/xen/grant-dma-ops.c | 2 +- - drivers/xen/swiotlb-xen.c | 2 +- fs/Kconfig | 1 + fs/Makefile | 1 + - fs/aio.c | 70 +- - fs/bcachefs/Kconfig | 77 + - fs/bcachefs/Makefile | 74 + - fs/bcachefs/acl.c | 412 ++ + fs/aio.c | 66 +- + fs/bcachefs/Kconfig | 76 + + fs/bcachefs/Makefile | 83 + + fs/bcachefs/acl.c | 412 +++ fs/bcachefs/acl.h | 58 + - fs/bcachefs/alloc_background.c | 2209 +++++++++ + fs/bcachefs/alloc_background.c | 2157 +++++++++++ fs/bcachefs/alloc_background.h | 257 ++ - fs/bcachefs/alloc_foreground.c | 1536 +++++++ - fs/bcachefs/alloc_foreground.h | 224 + + fs/bcachefs/alloc_foreground.c | 1571 ++++++++ + fs/bcachefs/alloc_foreground.h | 224 ++ fs/bcachefs/alloc_types.h | 126 + - fs/bcachefs/backpointers.c | 873 ++++ + fs/bcachefs/backpointers.c | 873 +++++ fs/bcachefs/backpointers.h | 131 + fs/bcachefs/bbpos.h | 48 + - fs/bcachefs/bcachefs.h | 1201 +++++ - fs/bcachefs/bcachefs_format.h | 2319 ++++++++++ + fs/bcachefs/bcachefs.h | 1146 ++++++ + fs/bcachefs/bcachefs_format.h | 2368 ++++++++++++ fs/bcachefs/bcachefs_ioctl.h | 368 ++ - fs/bcachefs/bkey.c | 1063 +++++ - fs/bcachefs/bkey.h | 774 ++++ + fs/bcachefs/bkey.c | 1107 ++++++ + fs/bcachefs/bkey.h | 782 ++++ fs/bcachefs/bkey_buf.h | 61 + fs/bcachefs/bkey_cmp.h | 129 + - fs/bcachefs/bkey_methods.c | 519 +++ - fs/bcachefs/bkey_methods.h | 193 + - fs/bcachefs/bkey_sort.c | 201 + + fs/bcachefs/bkey_methods.c | 456 +++ + fs/bcachefs/bkey_methods.h | 188 + + fs/bcachefs/bkey_sort.c | 201 ++ fs/bcachefs/bkey_sort.h | 44 + - fs/bcachefs/bset.c | 1587 +++++++ + fs/bcachefs/bset.c | 1587 ++++++++ fs/bcachefs/bset.h | 541 +++ - fs/bcachefs/btree_cache.c | 1277 ++++++ + fs/bcachefs/btree_cache.c | 1274 +++++++ fs/bcachefs/btree_cache.h | 130 + - fs/bcachefs/btree_gc.c | 2126 +++++++++ - fs/bcachefs/btree_gc.h | 
113 + - fs/bcachefs/btree_io.c | 2267 ++++++++++ - fs/bcachefs/btree_io.h | 228 + - fs/bcachefs/btree_iter.c | 3214 +++++++++++++ - fs/bcachefs/btree_iter.h | 924 ++++ - fs/bcachefs/btree_key_cache.c | 1088 +++++ + fs/bcachefs/btree_gc.c | 2127 +++++++++++ + fs/bcachefs/btree_gc.h | 114 + + fs/bcachefs/btree_io.c | 2245 ++++++++++++ + fs/bcachefs/btree_io.h | 228 ++ + fs/bcachefs/btree_iter.c | 3194 +++++++++++++++++ + fs/bcachefs/btree_iter.h | 940 +++++ + fs/bcachefs/btree_journal_iter.c | 531 +++ + fs/bcachefs/btree_journal_iter.h | 57 + + fs/bcachefs/btree_key_cache.c | 1088 ++++++ fs/bcachefs/btree_key_cache.h | 48 + fs/bcachefs/btree_locking.c | 797 ++++ - fs/bcachefs/btree_locking.h | 424 ++ - fs/bcachefs/btree_types.h | 743 +++ - fs/bcachefs/btree_update.h | 352 ++ - fs/bcachefs/btree_update_interior.c | 2488 ++++++++++ + fs/bcachefs/btree_locking.h | 423 +++ + fs/bcachefs/btree_trans_commit.c | 1156 ++++++ + fs/bcachefs/btree_types.h | 746 ++++ + fs/bcachefs/btree_update.c | 898 +++++ + fs/bcachefs/btree_update.h | 353 ++ + fs/bcachefs/btree_update_interior.c | 2488 +++++++++++++ fs/bcachefs/btree_update_interior.h | 337 ++ - fs/bcachefs/btree_update_leaf.c | 2097 +++++++++ - fs/bcachefs/btree_write_buffer.c | 372 ++ + fs/bcachefs/btree_write_buffer.c | 375 ++ fs/bcachefs/btree_write_buffer.h | 14 + fs/bcachefs/btree_write_buffer_types.h | 44 + - fs/bcachefs/buckets.c | 2106 +++++++++ - fs/bcachefs/buckets.h | 368 ++ + fs/bcachefs/buckets.c | 2107 +++++++++++ + fs/bcachefs/buckets.h | 413 +++ fs/bcachefs/buckets_types.h | 92 + fs/bcachefs/buckets_waiting_for_journal.c | 166 + fs/bcachefs/buckets_waiting_for_journal.h | 15 + .../buckets_waiting_for_journal_types.h | 23 + fs/bcachefs/chardev.c | 769 ++++ fs/bcachefs/chardev.h | 31 + - fs/bcachefs/checksum.c | 709 +++ - fs/bcachefs/checksum.h | 209 + + fs/bcachefs/checksum.c | 753 ++++ + fs/bcachefs/checksum.h | 211 ++ fs/bcachefs/clock.c | 193 + fs/bcachefs/clock.h | 38 + fs/bcachefs/clock_types.h | 37 + - fs/bcachefs/compress.c | 713 +++ + fs/bcachefs/compress.c | 714 ++++ fs/bcachefs/compress.h | 55 + fs/bcachefs/counters.c | 107 + fs/bcachefs/counters.h | 17 + fs/bcachefs/darray.h | 87 + fs/bcachefs/data_update.c | 562 +++ fs/bcachefs/data_update.h | 43 + - fs/bcachefs/debug.c | 957 ++++ + fs/bcachefs/debug.c | 957 +++++ fs/bcachefs/debug.h | 32 + - fs/bcachefs/dirent.c | 565 +++ + fs/bcachefs/dirent.c | 590 +++ fs/bcachefs/dirent.h | 70 + - fs/bcachefs/disk_groups.c | 555 +++ + fs/bcachefs/disk_groups.c | 556 +++ fs/bcachefs/disk_groups.h | 106 + - fs/bcachefs/ec.c | 1960 ++++++++ - fs/bcachefs/ec.h | 263 ++ + fs/bcachefs/ec.c | 1972 ++++++++++ + fs/bcachefs/ec.h | 260 ++ fs/bcachefs/ec_types.h | 41 + fs/bcachefs/errcode.c | 63 + - fs/bcachefs/errcode.h | 246 + + fs/bcachefs/errcode.h | 252 ++ fs/bcachefs/error.c | 294 ++ - fs/bcachefs/error.h | 206 + + fs/bcachefs/error.h | 206 ++ fs/bcachefs/extent_update.c | 173 + fs/bcachefs/extent_update.h | 12 + - fs/bcachefs/extents.c | 1394 ++++++ + fs/bcachefs/extents.c | 1403 ++++++++ fs/bcachefs/extents.h | 757 ++++ fs/bcachefs/extents_types.h | 40 + fs/bcachefs/eytzinger.h | 281 ++ fs/bcachefs/fifo.h | 127 + fs/bcachefs/fs-common.c | 501 +++ fs/bcachefs/fs-common.h | 43 + - fs/bcachefs/fs-io.c | 3982 +++++++++++++++++ - fs/bcachefs/fs-io.h | 54 + - fs/bcachefs/fs-ioctl.c | 556 +++ + fs/bcachefs/fs-io-buffered.c | 1099 ++++++ + fs/bcachefs/fs-io-buffered.h | 27 + + fs/bcachefs/fs-io-direct.c | 679 ++++ + fs/bcachefs/fs-io-direct.h | 16 + + fs/bcachefs/fs-io-pagecache.c | 788 ++++ + 
fs/bcachefs/fs-io-pagecache.h | 176 + + fs/bcachefs/fs-io.c | 1250 +++++++ + fs/bcachefs/fs-io.h | 184 + + fs/bcachefs/fs-ioctl.c | 559 +++ fs/bcachefs/fs-ioctl.h | 81 + - fs/bcachefs/fs.c | 1943 ++++++++ - fs/bcachefs/fs.h | 208 + - fs/bcachefs/fsck.c | 2471 ++++++++++ + fs/bcachefs/fs.c | 1961 ++++++++++ + fs/bcachefs/fs.h | 209 ++ + fs/bcachefs/fsck.c | 2483 +++++++++++++ fs/bcachefs/fsck.h | 14 + - fs/bcachefs/inode.c | 925 ++++ - fs/bcachefs/inode.h | 201 + - fs/bcachefs/io.c | 3059 +++++++++++++ - fs/bcachefs/io.h | 202 + + fs/bcachefs/inode.c | 1111 ++++++ + fs/bcachefs/inode.h | 204 ++ + fs/bcachefs/io.c | 3051 ++++++++++++++++ + fs/bcachefs/io.h | 202 ++ fs/bcachefs/io_types.h | 165 + - fs/bcachefs/journal.c | 1438 ++++++ + fs/bcachefs/journal.c | 1438 ++++++++ fs/bcachefs/journal.h | 526 +++ - fs/bcachefs/journal_io.c | 1863 ++++++++ - fs/bcachefs/journal_io.h | 64 + - fs/bcachefs/journal_reclaim.c | 873 ++++ + fs/bcachefs/journal_io.c | 1888 ++++++++++ + fs/bcachefs/journal_io.h | 65 + + fs/bcachefs/journal_reclaim.c | 874 +++++ fs/bcachefs/journal_reclaim.h | 86 + - fs/bcachefs/journal_sb.c | 219 + + fs/bcachefs/journal_sb.c | 219 ++ fs/bcachefs/journal_sb.h | 24 + fs/bcachefs/journal_seq_blacklist.c | 322 ++ fs/bcachefs/journal_seq_blacklist.h | 22 + @@ -150,170 +153,124 @@ Signed-off-by: Piotr Gorski fs/bcachefs/lru.h | 69 + fs/bcachefs/migrate.c | 182 + fs/bcachefs/migrate.h | 7 + - fs/bcachefs/move.c | 1168 +++++ - fs/bcachefs/move.h | 96 + + fs/bcachefs/move.c | 1162 ++++++ + fs/bcachefs/move.h | 95 + fs/bcachefs/move_types.h | 36 + - fs/bcachefs/movinggc.c | 421 ++ + fs/bcachefs/movinggc.c | 423 +++ fs/bcachefs/movinggc.h | 12 + fs/bcachefs/nocow_locking.c | 123 + fs/bcachefs/nocow_locking.h | 49 + fs/bcachefs/nocow_locking_types.h | 20 + - fs/bcachefs/opts.c | 592 +++ + fs/bcachefs/opts.c | 599 ++++ fs/bcachefs/opts.h | 563 +++ - fs/bcachefs/printbuf.c | 415 ++ + fs/bcachefs/printbuf.c | 415 +++ fs/bcachefs/printbuf.h | 284 ++ - fs/bcachefs/quota.c | 981 ++++ + fs/bcachefs/quota.c | 981 +++++ fs/bcachefs/quota.h | 74 + fs/bcachefs/quota_types.h | 43 + - fs/bcachefs/rebalance.c | 364 ++ + fs/bcachefs/rebalance.c | 368 ++ fs/bcachefs/rebalance.h | 28 + fs/bcachefs/rebalance_types.h | 26 + - fs/bcachefs/recovery.c | 1670 +++++++ - fs/bcachefs/recovery.h | 60 + + fs/bcachefs/recovery.c | 1057 ++++++ + fs/bcachefs/recovery.h | 33 + + fs/bcachefs/recovery_types.h | 48 + fs/bcachefs/reflink.c | 399 ++ fs/bcachefs/reflink.h | 81 + - fs/bcachefs/replicas.c | 1059 +++++ + fs/bcachefs/replicas.c | 1059 ++++++ fs/bcachefs/replicas.h | 91 + fs/bcachefs/replicas_types.h | 27 + + fs/bcachefs/sb-clean.c | 395 ++ + fs/bcachefs/sb-clean.h | 16 + + fs/bcachefs/sb-members.c | 173 + + fs/bcachefs/sb-members.h | 176 + fs/bcachefs/seqmutex.h | 48 + fs/bcachefs/siphash.c | 173 + fs/bcachefs/siphash.h | 87 + + fs/bcachefs/six.c | 918 +++++ + fs/bcachefs/six.h | 388 ++ + fs/bcachefs/snapshot.c | 1687 +++++++++ + fs/bcachefs/snapshot.h | 272 ++ fs/bcachefs/str_hash.h | 370 ++ - fs/bcachefs/subvolume.c | 1749 ++++++++ - fs/bcachefs/subvolume.h | 258 ++ + fs/bcachefs/subvolume.c | 451 +++ + fs/bcachefs/subvolume.h | 35 + fs/bcachefs/subvolume_types.h | 31 + - fs/bcachefs/super-io.c | 1714 +++++++ - fs/bcachefs/super-io.h | 142 + - fs/bcachefs/super.c | 2007 +++++++++ - fs/bcachefs/super.h | 266 ++ - fs/bcachefs/super_types.h | 51 + - fs/bcachefs/sysfs.c | 1064 +++++ + fs/bcachefs/super-io.c | 1265 +++++++ + fs/bcachefs/super-io.h | 133 + + fs/bcachefs/super.c | 2015 +++++++++++ + fs/bcachefs/super.h 
| 52 + + fs/bcachefs/super_types.h | 52 + + fs/bcachefs/sysfs.c | 1059 ++++++ fs/bcachefs/sysfs.h | 48 + - fs/bcachefs/tests.c | 939 ++++ + fs/bcachefs/tests.c | 970 +++++ fs/bcachefs/tests.h | 15 + fs/bcachefs/trace.c | 16 + - fs/bcachefs/trace.h | 1247 ++++++ + fs/bcachefs/trace.h | 1265 +++++++ fs/bcachefs/two_state_shared_lock.c | 8 + fs/bcachefs/two_state_shared_lock.h | 59 + - fs/bcachefs/util.c | 1137 +++++ - fs/bcachefs/util.h | 846 ++++ - fs/bcachefs/varint.c | 122 + + fs/bcachefs/util.c | 1144 ++++++ + fs/bcachefs/util.h | 851 +++++ + fs/bcachefs/varint.c | 123 + fs/bcachefs/varint.h | 11 + fs/bcachefs/vstructs.h | 63 + - fs/bcachefs/xattr.c | 648 +++ + fs/bcachefs/xattr.c | 649 ++++ fs/bcachefs/xattr.h | 50 + fs/dcache.c | 12 +- fs/inode.c | 218 +- fs/iomap/buffered-io.c | 45 +- - fs/super.c | 40 +- fs/xfs/xfs_iomap.c | 3 + fs/xfs/xfs_mount.h | 2 + fs/xfs/xfs_super.c | 6 +- - include/asm-generic/codetag.lds.h | 15 + - include/asm-generic/vmlinux.lds.h | 3 + - include/linux/alloc_tag.h | 160 + include/linux/bio.h | 7 +- include/linux/blkdev.h | 1 + .../md/bcache => include/linux}/closure.h | 46 +- - include/linux/codetag.h | 110 + include/linux/dcache.h | 1 + - include/linux/dma-map-ops.h | 2 +- - include/linux/dynamic_fault.h | 79 + include/linux/exportfs.h | 6 + - include/linux/fortify-string.h | 5 +- - include/linux/fs.h | 16 +- + include/linux/fs.h | 15 +- include/linux/generic-radix-tree.h | 68 +- - include/linux/gfp.h | 111 +- - include/linux/gfp_types.h | 101 +- - include/linux/hrtimer.h | 2 +- + include/linux/gfp_types.h | 90 +- include/linux/iomap.h | 1 + include/linux/list_bl.h | 22 + include/linux/lockdep.h | 10 + include/linux/lockdep_types.h | 2 +- include/linux/mean_and_variance.h | 198 + - include/linux/memcontrol.h | 56 +- - include/linux/mempool.h | 73 +- - include/linux/mm.h | 8 + - include/linux/mm_types.h | 4 +- include/linux/nodemask.h | 2 +- include/linux/nodemask_types.h | 9 + - include/linux/page_ext.h | 1 - - include/linux/pagemap.h | 9 +- - include/linux/percpu.h | 19 +- - include/linux/pgalloc_tag.h | 105 + include/linux/prandom.h | 1 - - include/linux/rhashtable-types.h | 9 +- - include/linux/sched.h | 29 +- + include/linux/sched.h | 5 +- include/linux/seq_buf.h | 2 + include/linux/shrinker.h | 9 +- - include/linux/six.h | 388 ++ - include/linux/slab.h | 180 +- - include/linux/slab_def.h | 2 +- - include/linux/slub_def.h | 4 +- - include/linux/string.h | 5 +- include/linux/string_helpers.h | 13 +- - include/linux/time_namespace.h | 2 + - include/linux/vmalloc.h | 60 +- - init/Kconfig | 4 + init/init_task.c | 1 + - kernel/Kconfig.locks | 3 + - kernel/dma/mapping.c | 4 +- - kernel/locking/Makefile | 1 + kernel/locking/lockdep.c | 46 + + kernel/locking/mutex.c | 3 + kernel/locking/osq_lock.c | 2 + - kernel/locking/six.c | 893 ++++ - kernel/module/main.c | 25 +- kernel/stacktrace.c | 2 + lib/Kconfig | 3 + - lib/Kconfig.debug | 54 + - lib/Makefile | 9 +- - lib/alloc_tag.c | 225 + - {drivers/md/bcache => lib}/closure.c | 36 +- - lib/codetag.c | 393 ++ - lib/dynamic_fault.c | 371 ++ + lib/Kconfig.debug | 18 + + lib/Makefile | 2 + + {drivers/md/bcache => lib}/closure.c | 41 +- lib/errname.c | 1 + lib/generic-radix-tree.c | 76 +- lib/iov_iter.c | 43 +- lib/math/Kconfig | 3 + lib/math/Makefile | 2 + lib/math/mean_and_variance.c | 158 + - lib/math/mean_and_variance_test.c | 239 + - lib/rhashtable.c | 42 +- + lib/math/mean_and_variance_test.c | 239 ++ + lib/rhashtable.c | 9 +- lib/seq_buf.c | 10 + - lib/string.c | 19 + lib/string_helpers.c | 26 +- 
lib/test-string_helpers.c | 4 +- - mm/Makefile | 2 +- - mm/compaction.c | 10 +- - mm/filemap.c | 6 +- - mm/huge_memory.c | 2 + mm/hugetlb.c | 8 +- - mm/kfence/core.c | 14 +- - mm/kfence/kfence.h | 4 +- mm/madvise.c | 61 + - mm/memcontrol.c | 56 +- - mm/mempolicy.c | 42 +- - mm/mempool.c | 34 +- - mm/mm_init.c | 1 + mm/oom_kill.c | 23 - - mm/page_alloc.c | 66 +- - mm/page_ext.c | 13 + - mm/page_owner.c | 2 +- - mm/percpu-internal.h | 26 +- - mm/percpu.c | 120 +- - {lib => mm}/show_mem.c | 37 + - mm/slab.c | 24 +- - mm/slab.h | 252 +- - mm/slab_common.c | 148 +- - mm/slub.c | 26 +- - mm/util.c | 44 +- - mm/vmalloc.c | 88 +- + mm/show_mem.c | 22 + + mm/slab.h | 6 +- + mm/slab_common.c | 52 +- mm/vmscan.c | 99 +- scripts/Kbuild.include | 10 + scripts/Makefile.lib | 2 +- scripts/kallsyms.c | 13 + - scripts/module.lds.S | 7 + - 308 files changed, 96733 insertions(+), 930 deletions(-) + 265 files changed, 95211 insertions(+), 312 deletions(-) create mode 100644 fs/bcachefs/Kconfig create mode 100644 fs/bcachefs/Makefile create mode 100644 fs/bcachefs/acl.c @@ -347,15 +304,18 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/btree_io.h create mode 100644 fs/bcachefs/btree_iter.c create mode 100644 fs/bcachefs/btree_iter.h + create mode 100644 fs/bcachefs/btree_journal_iter.c + create mode 100644 fs/bcachefs/btree_journal_iter.h create mode 100644 fs/bcachefs/btree_key_cache.c create mode 100644 fs/bcachefs/btree_key_cache.h create mode 100644 fs/bcachefs/btree_locking.c create mode 100644 fs/bcachefs/btree_locking.h + create mode 100644 fs/bcachefs/btree_trans_commit.c create mode 100644 fs/bcachefs/btree_types.h + create mode 100644 fs/bcachefs/btree_update.c create mode 100644 fs/bcachefs/btree_update.h create mode 100644 fs/bcachefs/btree_update_interior.c create mode 100644 fs/bcachefs/btree_update_interior.h - create mode 100644 fs/bcachefs/btree_update_leaf.c create mode 100644 fs/bcachefs/btree_write_buffer.c create mode 100644 fs/bcachefs/btree_write_buffer.h create mode 100644 fs/bcachefs/btree_write_buffer_types.h @@ -401,6 +361,12 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/fifo.h create mode 100644 fs/bcachefs/fs-common.c create mode 100644 fs/bcachefs/fs-common.h + create mode 100644 fs/bcachefs/fs-io-buffered.c + create mode 100644 fs/bcachefs/fs-io-buffered.h + create mode 100644 fs/bcachefs/fs-io-direct.c + create mode 100644 fs/bcachefs/fs-io-direct.h + create mode 100644 fs/bcachefs/fs-io-pagecache.c + create mode 100644 fs/bcachefs/fs-io-pagecache.h create mode 100644 fs/bcachefs/fs-io.c create mode 100644 fs/bcachefs/fs-io.h create mode 100644 fs/bcachefs/fs-ioctl.c @@ -452,14 +418,23 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/rebalance_types.h create mode 100644 fs/bcachefs/recovery.c create mode 100644 fs/bcachefs/recovery.h + create mode 100644 fs/bcachefs/recovery_types.h create mode 100644 fs/bcachefs/reflink.c create mode 100644 fs/bcachefs/reflink.h create mode 100644 fs/bcachefs/replicas.c create mode 100644 fs/bcachefs/replicas.h create mode 100644 fs/bcachefs/replicas_types.h + create mode 100644 fs/bcachefs/sb-clean.c + create mode 100644 fs/bcachefs/sb-clean.h + create mode 100644 fs/bcachefs/sb-members.c + create mode 100644 fs/bcachefs/sb-members.h create mode 100644 fs/bcachefs/seqmutex.h create mode 100644 fs/bcachefs/siphash.c create mode 100644 fs/bcachefs/siphash.h + create mode 100644 fs/bcachefs/six.c + create mode 100644 fs/bcachefs/six.h + create mode 100644 fs/bcachefs/snapshot.c + create mode 100644 
fs/bcachefs/snapshot.h create mode 100644 fs/bcachefs/str_hash.h create mode 100644 fs/bcachefs/subvolume.c create mode 100644 fs/bcachefs/subvolume.h @@ -484,109 +459,18 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/vstructs.h create mode 100644 fs/bcachefs/xattr.c create mode 100644 fs/bcachefs/xattr.h - create mode 100644 include/asm-generic/codetag.lds.h - create mode 100644 include/linux/alloc_tag.h rename {drivers/md/bcache => include/linux}/closure.h (93%) - create mode 100644 include/linux/codetag.h - create mode 100644 include/linux/dynamic_fault.h create mode 100644 include/linux/mean_and_variance.h create mode 100644 include/linux/nodemask_types.h - create mode 100644 include/linux/pgalloc_tag.h - create mode 100644 include/linux/six.h - create mode 100644 kernel/locking/six.c - create mode 100644 lib/alloc_tag.c - rename {drivers/md/bcache => lib}/closure.c (88%) - create mode 100644 lib/codetag.c - create mode 100644 lib/dynamic_fault.c + rename {drivers/md/bcache => lib}/closure.c (85%) create mode 100644 lib/math/mean_and_variance.c create mode 100644 lib/math/mean_and_variance_test.c - rename {lib => mm}/show_mem.c (57%) -diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst -index 45ba1f4dc..0a012ac13 100644 ---- a/Documentation/admin-guide/sysctl/vm.rst -+++ b/Documentation/admin-guide/sysctl/vm.rst -@@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm: - - legacy_va_layout - - lowmem_reserve_ratio - - max_map_count -+- mem_profiling (only if CONFIG_MEM_ALLOC_PROFILING=y) - - memory_failure_early_kill - - memory_failure_recovery - - min_free_kbytes -@@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation. - The default value is 65530. - - -+mem_profiling -+============== -+ -+Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y) -+ -+1: Enable memory profiling. -+ -+0: Disabld memory profiling. -+ -+Enabling memory profiling introduces a small performance overhead for all -+memory allocations. -+ -+The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. -+ -+ - memory_failure_early_kill: - ========================== - -diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst -index 7897a7daf..810f851e6 100644 ---- a/Documentation/filesystems/proc.rst -+++ b/Documentation/filesystems/proc.rst -@@ -683,6 +683,7 @@ files are there, and which are missing. - ============ =============================================================== - File Content - ============ =============================================================== -+ allocinfo Memory allocations profiling information - apm Advanced power management info - buddyinfo Kernel memory allocator information (see text) (2.5) - bus Directory containing bus specific information -@@ -942,6 +943,33 @@ also be allocatable although a lot of filesystem metadata may have to be - reclaimed to achieve this. - - -+allocinfo -+~~~~~~~ -+ -+Provides information about memory allocations at all locations in the code -+base. Each allocation in the code is identified by its source file, line -+number, module and the function calling the allocation. The number of bytes -+allocated at each location is reported. -+ -+Example output. 
-+ -+:: -+ -+ > cat /proc/allocinfo -+ -+ 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page -+ 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order -+ 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts -+ 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact -+ 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one -+ 1.16MiB fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc -+ 1.00MiB mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare -+ 734KiB fs/xfs/kmem.c:20 module:xfs func:kmem_alloc -+ 640KiB kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func -+ 640KiB drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf -+ ... -+ -+ - meminfo - ~~~~~~~ - diff --git a/MAINTAINERS b/MAINTAINERS -index 35e195946..22c57b3bc 100644 +index 4cc6bf79f..9c7fa5956 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -3522,6 +3522,14 @@ W: http://bcache.evilpiepirate.org +@@ -3458,6 +3458,14 @@ W: http://bcache.evilpiepirate.org C: irc://irc.oftc.net/bcache F: drivers/md/bcache/ @@ -601,7 +485,7 @@ index 35e195946..22c57b3bc 100644 BDISP ST MEDIA DRIVER M: Fabien Dessenne L: linux-media@vger.kernel.org -@@ -5064,6 +5072,14 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core +@@ -5027,6 +5035,14 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core F: Documentation/devicetree/bindings/timer/ F: drivers/clocksource/ @@ -616,21 +500,7 @@ index 35e195946..22c57b3bc 100644 CMPC ACPI DRIVER M: Thadeu Lima de Souza Cascardo M: Daniel Oliveira Nascimento -@@ -5114,6 +5130,13 @@ S: Supported - F: Documentation/process/code-of-conduct-interpretation.rst - F: Documentation/process/code-of-conduct.rst - -+CODE TAGGING -+M: Suren Baghdasaryan -+M: Kent Overstreet -+S: Maintained -+F: include/linux/codetag.h -+F: lib/codetag.c -+ - COMEDI DRIVERS - M: Ian Abbott - M: H Hartley Sweeten -@@ -8662,6 +8685,13 @@ F: Documentation/devicetree/bindings/power/power?domain* +@@ -8673,6 +8689,13 @@ F: Documentation/devicetree/bindings/power/power?domain* F: drivers/base/power/domain*.c F: include/linux/pm_domain.h @@ -644,9 +514,9 @@ index 35e195946..22c57b3bc 100644 GENERIC RESISTIVE TOUCHSCREEN ADC DRIVER M: Eugen Hristev L: linux-input@vger.kernel.org -@@ -12850,6 +12880,15 @@ F: Documentation/devicetree/bindings/net/ieee802154/mcr20a.txt - F: drivers/net/ieee802154/mcr20a.c - F: drivers/net/ieee802154/mcr20a.h +@@ -12925,6 +12948,15 @@ S: Maintained + F: drivers/net/mdio/mdio-regmap.c + F: include/linux/mdio/mdio-regmap.h +MEAN AND VARIANCE LIBRARY +M: Daniel B. 
Hill @@ -660,54 +530,8 @@ index 35e195946..22c57b3bc 100644 MEASUREMENT COMPUTING CIO-DAC IIO DRIVER M: William Breathitt Gray L: linux-iio@vger.kernel.org -@@ -13489,6 +13528,15 @@ F: mm/memblock.c - F: mm/mm_init.c - F: tools/testing/memblock/ - -+MEMORY ALLOCATION PROFILING -+M: Suren Baghdasaryan -+M: Kent Overstreet -+S: Maintained -+F: include/linux/alloc_tag.h -+F: include/linux/codetag_ctx.h -+F: lib/alloc_tag.c -+F: lib/pgalloc_tag.c -+ - MEMORY CONTROLLER DRIVERS - M: Krzysztof Kozlowski - L: linux-kernel@vger.kernel.org -@@ -19376,6 +19424,14 @@ S: Maintained - W: http://www.winischhofer.at/linuxsisusbvga.shtml - F: drivers/usb/misc/sisusbvga/ - -+SIX LOCKS -+M: Kent Overstreet -+L: linux-bcachefs@vger.kernel.org -+S: Supported -+C: irc://irc.oftc.net/bcache -+F: include/linux/six.h -+F: kernel/locking/six.c -+ - SL28 CPLD MFD DRIVER - M: Michael Walle - S: Maintained -diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h -index db7b371b3..31823d971 100644 ---- a/arch/arm64/include/asm/spectre.h -+++ b/arch/arm64/include/asm/spectre.h -@@ -13,8 +13,8 @@ - #define __BP_HARDEN_HYP_VECS_SZ ((BP_HARDEN_EL2_SLOTS - 1) * SZ_2K) - - #ifndef __ASSEMBLY__ -- --#include -+#include -+#include - - #include - #include diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c -index e8db8c8ef..1a3bd656f 100644 +index e7ea492ac..5936205bf 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -261,7 +261,7 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e @@ -719,34 +543,8 @@ index e8db8c8ef..1a3bd656f 100644 pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, exec ? " (exec)" : ""); -diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c -index 56a917df4..842a0ec5e 100644 ---- a/arch/x86/kernel/amd_gart_64.c -+++ b/arch/x86/kernel/amd_gart_64.c -@@ -676,7 +676,7 @@ static const struct dma_map_ops gart_dma_ops = { - .get_sgtable = dma_common_get_sgtable, - .dma_supported = dma_direct_supported, - .get_required_mask = dma_direct_get_required_mask, -- .alloc_pages = dma_direct_alloc_pages, -+ .alloc_pages_op = dma_direct_alloc_pages, - .free_pages = dma_direct_free_pages, - }; - -diff --git a/block/bdev.c b/block/bdev.c -index 21c63bfef..a4d7e8732 100644 ---- a/block/bdev.c -+++ b/block/bdev.c -@@ -934,7 +934,7 @@ EXPORT_SYMBOL(lookup_bdev); - - int __invalidate_device(struct block_device *bdev, bool kill_dirty) - { -- struct super_block *sb = get_super(bdev); -+ struct super_block *sb = try_get_super(bdev); - int res = 0; - - if (sb) { diff --git a/block/bio.c b/block/bio.c -index 043944fd4..70b5c987b 100644 +index 867217921..425b3da39 100644 --- a/block/bio.c +++ b/block/bio.c @@ -606,15 +606,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) @@ -768,7 +566,7 @@ index 043944fd4..70b5c987b 100644 /** * bio_truncate - truncate the bio to small size of @new_size -@@ -1245,7 +1245,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +@@ -1252,7 +1252,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page **pages = (struct page **)bv; ssize_t size, left; unsigned len, i = 0; @@ -777,7 +575,7 @@ index 043944fd4..70b5c987b 100644 int ret = 0; /* -@@ -1274,10 +1274,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +@@ -1281,10 +1281,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct 
iov_iter *iter) nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); @@ -793,7 +591,7 @@ index 043944fd4..70b5c987b 100644 if (unlikely(!size)) { ret = -EFAULT; goto out; -@@ -1481,6 +1483,7 @@ void bio_set_pages_dirty(struct bio *bio) +@@ -1490,6 +1492,7 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(bvec->bv_page); } } @@ -801,7 +599,7 @@ index 043944fd4..70b5c987b 100644 /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. -@@ -1540,6 +1543,7 @@ void bio_check_pages_dirty(struct bio *bio) +@@ -1549,6 +1552,7 @@ void bio_check_pages_dirty(struct bio *bio) spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } @@ -810,10 +608,10 @@ index 043944fd4..70b5c987b 100644 static inline bool bio_remaining_done(struct bio *bio) { diff --git a/block/blk-core.c b/block/blk-core.c -index 3fc68b944..1f23abb7d 100644 +index 9866468c7..9d51e9894 100644 --- a/block/blk-core.c +++ b/block/blk-core.c -@@ -205,6 +205,7 @@ const char *blk_status_to_str(blk_status_t status) +@@ -208,6 +208,7 @@ const char *blk_status_to_str(blk_status_t status) return ""; return blk_errors[idx].name; } @@ -822,7 +620,7 @@ index 3fc68b944..1f23abb7d 100644 /** * blk_sync_queue - cancel any pending callbacks on a queue diff --git a/block/blk.h b/block/blk.h -index 45547bcf1..f20f9ca03 100644 +index 608c5dcc5..47e03fc44 100644 --- a/block/blk.h +++ b/block/blk.h @@ -251,7 +251,6 @@ static inline void bio_integrity_free(struct bio *bio) @@ -833,11 +631,64 @@ index 45547bcf1..f20f9ca03 100644 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); +diff --git a/drivers/accel/ivpu/ivpu_gem.c b/drivers/accel/ivpu/ivpu_gem.c +index 9967fcfa2..4e8122fb6 100644 +--- a/drivers/accel/ivpu/ivpu_gem.c ++++ b/drivers/accel/ivpu/ivpu_gem.c +@@ -61,7 +61,7 @@ static void prime_unmap_pages_locked(struct ivpu_bo *bo) + static const struct ivpu_bo_ops prime_ops = { + .type = IVPU_BO_TYPE_PRIME, + .name = "prime", +- .alloc_pages = prime_alloc_pages_locked, ++ .alloc_pages_op = prime_alloc_pages_locked, + .free_pages = prime_free_pages_locked, + .map_pages = prime_map_pages_locked, + .unmap_pages = prime_unmap_pages_locked, +@@ -134,7 +134,7 @@ static void ivpu_bo_unmap_pages_locked(struct ivpu_bo *bo) + static const struct ivpu_bo_ops shmem_ops = { + .type = IVPU_BO_TYPE_SHMEM, + .name = "shmem", +- .alloc_pages = shmem_alloc_pages_locked, ++ .alloc_pages_op = shmem_alloc_pages_locked, + .free_pages = shmem_free_pages_locked, + .map_pages = ivpu_bo_map_pages_locked, + .unmap_pages = ivpu_bo_unmap_pages_locked, +@@ -186,7 +186,7 @@ static void internal_free_pages_locked(struct ivpu_bo *bo) + static const struct ivpu_bo_ops internal_ops = { + .type = IVPU_BO_TYPE_INTERNAL, + .name = "internal", +- .alloc_pages = internal_alloc_pages_locked, ++ .alloc_pages_op = internal_alloc_pages_locked, + .free_pages = internal_free_pages_locked, + .map_pages = ivpu_bo_map_pages_locked, + .unmap_pages = ivpu_bo_unmap_pages_locked, +@@ -200,7 +200,7 @@ static int __must_check ivpu_bo_alloc_and_map_pages_locked(struct ivpu_bo *bo) + lockdep_assert_held(&bo->lock); + drm_WARN_ON(&vdev->drm, bo->sgt); + +- ret = bo->ops->alloc_pages(bo); ++ ret = bo->ops->alloc_pages_op(bo); + if (ret) { + ivpu_err(vdev, "Failed to allocate pages for BO: %d", ret); + return ret; +diff --git a/drivers/accel/ivpu/ivpu_gem.h b/drivers/accel/ivpu/ivpu_gem.h +index 6b0ceda5f..b81cf2af0 100644 +--- a/drivers/accel/ivpu/ivpu_gem.h ++++ b/drivers/accel/ivpu/ivpu_gem.h +@@ -42,7 
+42,7 @@ enum ivpu_bo_type { + struct ivpu_bo_ops { + enum ivpu_bo_type type; + const char *name; +- int (*alloc_pages)(struct ivpu_bo *bo); ++ int (*alloc_pages_op)(struct ivpu_bo *bo); + void (*free_pages)(struct ivpu_bo *bo); + int (*map_pages)(struct ivpu_bo *bo); + void (*unmap_pages)(struct ivpu_bo *bo); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c -index b47358da9..be10661f1 100644 +index 1fe011676..59140424d 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c -@@ -990,9 +990,9 @@ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) +@@ -986,9 +986,9 @@ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9); string_get_size(nblocks, queue_logical_block_size(q), @@ -862,19 +713,6 @@ index 9d7bf8ee4..6b1748e1f 100644 seq_printf(m, "Max buffer size: %s\n", buf); seq_printf(m, "Number of errors: %u\n", gdrm->stats_num_errors); -diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c -index 7a9f0b0bd..76a9d5ca4 100644 ---- a/drivers/iommu/dma-iommu.c -+++ b/drivers/iommu/dma-iommu.c -@@ -1556,7 +1556,7 @@ static const struct dma_map_ops iommu_dma_ops = { - .flags = DMA_F_PCI_P2PDMA_SUPPORTED, - .alloc = iommu_dma_alloc, - .free = iommu_dma_free, -- .alloc_pages = dma_common_alloc_pages, -+ .alloc_pages_op = dma_common_alloc_pages, - .free_pages = dma_common_free_pages, - .alloc_noncontiguous = iommu_dma_alloc_noncontiguous, - .free_noncontiguous = iommu_dma_free_noncontiguous, diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 529c9d04e..b2d10063d 100644 --- a/drivers/md/bcache/Kconfig @@ -917,7 +755,7 @@ index 5b87e5967..054e8a33a 100644 + journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ util.o writeback.o features.o diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index aebb7ef10..c8b4914ad 100644 +index 5a79bb3c2..7c0d00432 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -179,6 +179,7 @@ @@ -937,10 +775,10 @@ index aebb7ef10..c8b4914ad 100644 struct bucket { atomic_t pin; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 077149c40..d43079d45 100644 +index 0ae2b3676..4affe5875 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c -@@ -2911,7 +2911,6 @@ static int __init bcache_init(void) +@@ -2905,7 +2905,6 @@ static int __init bcache_init(void) goto err; bch_debug_init(); @@ -970,19 +808,19 @@ index 6f3cb7c92..f61ab1bad 100644 #ifdef CONFIG_BCACHE_DEBUG diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c -index e46330815..b5dfaf680 100644 +index b6f4be25b..a09ce965c 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c -@@ -2509,7 +2509,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, +@@ -2510,7 +2510,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, blk_queue_write_cache(md->queue.queue, cache_enabled, fua_enabled); - string_get_size((u64)size, 512, STRING_UNITS_2, + string_get_size((u64)size, 512, STRING_SIZE_BASE2, cap_str, sizeof(cap_str)); - pr_info("%s: %s %s %s %s\n", + pr_info("%s: %s %s %s%s\n", md->disk->disk_name, mmc_card_id(card), mmc_card_name(card), -@@ -2705,7 +2705,7 @@ static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, +@@ -2706,7 +2706,7 @@ static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, list_add(&rpmb->node, &md->rpmbs); @@ -1037,10 +875,10 @@ index 14e0d989c..7d5fbebd3 100644 } diff 
--git a/drivers/scsi/sd.c b/drivers/scsi/sd.c -index 1624d528a..bf0a1907b 100644 +index 3c668cfb1..c9abe8f9a 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c -@@ -2580,10 +2580,10 @@ sd_print_capacity(struct scsi_disk *sdkp, +@@ -2681,10 +2681,10 @@ sd_print_capacity(struct scsi_disk *sdkp, if (!sdkp->first_scan && old_capacity == sdkp->capacity) return; @@ -1055,31 +893,6 @@ index 1624d528a..bf0a1907b 100644 sd_printk(KERN_NOTICE, sdkp, "%llu %d-byte logical blocks: (%s/%s)\n", -diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c -index 9784a77fa..6c7d984f1 100644 ---- a/drivers/xen/grant-dma-ops.c -+++ b/drivers/xen/grant-dma-ops.c -@@ -282,7 +282,7 @@ static int xen_grant_dma_supported(struct device *dev, u64 mask) - static const struct dma_map_ops xen_grant_dma_ops = { - .alloc = xen_grant_dma_alloc, - .free = xen_grant_dma_free, -- .alloc_pages = xen_grant_dma_alloc_pages, -+ .alloc_pages_op = xen_grant_dma_alloc_pages, - .free_pages = xen_grant_dma_free_pages, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, -diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c -index 67aa74d20..5ab261615 100644 ---- a/drivers/xen/swiotlb-xen.c -+++ b/drivers/xen/swiotlb-xen.c -@@ -403,6 +403,6 @@ const struct dma_map_ops xen_swiotlb_dma_ops = { - .dma_supported = xen_swiotlb_dma_supported, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, -- .alloc_pages = dma_common_alloc_pages, -+ .alloc_pages_op = dma_common_alloc_pages, - .free_pages = dma_common_free_pages, - }; diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec7..b05c45f63 100644 --- a/fs/Kconfig @@ -1093,10 +906,10 @@ index 18d034ec7..b05c45f63 100644 endif # BLOCK diff --git a/fs/Makefile b/fs/Makefile -index 5bfdbf0d7..977a05cae 100644 +index e513aaee0..cd357ea45 100644 --- a/fs/Makefile +++ b/fs/Makefile -@@ -129,6 +129,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ +@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ obj-$(CONFIG_F2FS_FS) += f2fs/ @@ -1105,10 +918,10 @@ index 5bfdbf0d7..977a05cae 100644 obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/aio.c b/fs/aio.c -index b0b17bd09..b3e14a9fe 100644 +index 77e33619d..5db996acc 100644 --- a/fs/aio.c +++ b/fs/aio.c -@@ -1109,6 +1109,11 @@ static inline void iocb_destroy(struct aio_kiocb *iocb) +@@ -1106,6 +1106,11 @@ static inline void iocb_destroy(struct aio_kiocb *iocb) kmem_cache_free(kiocb_cachep, iocb); } @@ -1120,7 +933,7 @@ index b0b17bd09..b3e14a9fe 100644 /* aio_complete * Called when the io request on the given iocb is complete. 
*/ -@@ -1117,7 +1122,7 @@ static void aio_complete(struct aio_kiocb *iocb) +@@ -1114,7 +1119,7 @@ static void aio_complete(struct aio_kiocb *iocb) struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; @@ -1129,7 +942,7 @@ index b0b17bd09..b3e14a9fe 100644 unsigned long flags; /* -@@ -1161,6 +1166,10 @@ static void aio_complete(struct aio_kiocb *iocb) +@@ -1156,6 +1161,10 @@ static void aio_complete(struct aio_kiocb *iocb) ctx->completed_events++; if (ctx->completed_events > 1) refill_reqs_available(ctx, head, tail); @@ -1140,7 +953,7 @@ index b0b17bd09..b3e14a9fe 100644 spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); -@@ -1181,8 +1190,18 @@ static void aio_complete(struct aio_kiocb *iocb) +@@ -1176,8 +1185,18 @@ static void aio_complete(struct aio_kiocb *iocb) */ smp_mb(); @@ -1161,20 +974,7 @@ index b0b17bd09..b3e14a9fe 100644 } static inline void iocb_put(struct aio_kiocb *iocb) -@@ -1250,10 +1269,10 @@ static long aio_read_events_ring(struct kioctx *ctx, - avail = min(avail, nr - ret); - avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); - -- ev = kmap(page); -+ ev = kmap_local_page(page); - copy_ret = copy_to_user(event + ret, ev + pos, - sizeof(*ev) * avail); -- kunmap(page); -+ kunmap_local(ev); - - if (unlikely(copy_ret)) { - ret = -EFAULT; -@@ -1298,7 +1317,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, +@@ -1290,7 +1309,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, ktime_t until) { @@ -1185,7 +985,7 @@ index b0b17bd09..b3e14a9fe 100644 /* * Note that aio_read_events() is being called as the conditional - i.e. -@@ -1314,12 +1335,37 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, +@@ -1306,12 +1327,37 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. 
*/ @@ -1231,10 +1031,10 @@ index b0b17bd09..b3e14a9fe 100644 diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig new file mode 100644 -index 000000000..6c698b3b3 +index 000000000..fb5b24f20 --- /dev/null +++ b/fs/bcachefs/Kconfig -@@ -0,0 +1,77 @@ +@@ -0,0 +1,76 @@ + +config BCACHEFS_FS + tristate "bcachefs filesystem support (EXPERIMENTAL)" @@ -1256,7 +1056,6 @@ index 000000000..6c698b3b3 + select CRYPTO_CHACHA20 + select CRYPTO_POLY1305 + select KEYS -+ select SIXLOCKS + select RAID6_PQ + select XOR_BLOCKS + select XXHASH @@ -1314,10 +1113,10 @@ index 000000000..6c698b3b3 + This disables device latency tracking and time stats, only for performance testing diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile new file mode 100644 -index 000000000..a71956048 +index 000000000..c87be5fb7 --- /dev/null +++ b/fs/bcachefs/Makefile -@@ -0,0 +1,74 @@ +@@ -0,0 +1,83 @@ + +obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o + @@ -1333,10 +1132,12 @@ index 000000000..a71956048 + btree_gc.o \ + btree_io.o \ + btree_iter.o \ ++ btree_journal_iter.o \ + btree_key_cache.o \ + btree_locking.o \ ++ btree_trans_commit.o \ ++ btree_update.o \ + btree_update_interior.o \ -+ btree_update_leaf.o \ + btree_write_buffer.o \ + buckets.o \ + buckets_waiting_for_journal.o \ @@ -1358,6 +1159,9 @@ index 000000000..a71956048 + fs-common.o \ + fs-ioctl.o \ + fs-io.o \ ++ fs-io-buffered.o \ ++ fs-io-direct.o \ ++ fs-io-pagecache.o \ + fsck.o \ + inode.o \ + io.o \ @@ -1379,7 +1183,11 @@ index 000000000..a71956048 + recovery.o \ + reflink.o \ + replicas.o \ ++ sb-clean.o \ ++ sb-members.o \ + siphash.o \ ++ six.o \ ++ snapshot.o \ + subvolume.o \ + super.o \ + super-io.o \ @@ -1876,10 +1684,10 @@ index 000000000..bb21d8d69 +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 -index 000000000..8d8481fc1 +index 000000000..540d94c0c --- /dev/null +++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,2209 @@ +@@ -0,0 +1,2157 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -1961,36 +1769,6 @@ index 000000000..8d8481fc1 + return v; +} + -+static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p, -+ unsigned field, u64 v) -+{ -+ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; -+ -+ if (!v) -+ return; -+ -+ a->v.fields |= 1 << field; -+ -+ switch (bytes) { -+ case 1: -+ *((u8 *) *p) = v; -+ break; -+ case 2: -+ *((__le16 *) *p) = cpu_to_le16(v); -+ break; -+ case 4: -+ *((__le32 *) *p) = cpu_to_le32(v); -+ break; -+ case 8: -+ *((__le64 *) *p) = cpu_to_le64(v); -+ break; -+ default: -+ BUG(); -+ } -+ -+ *p += bytes; -+} -+ +static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ @@ -2149,10 +1927,9 @@ index 000000000..8d8481fc1 +} + +int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, struct printbuf *err) +{ + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); -+ int rw = flags & WRITE; + + if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { + prt_printf(err, "bad val size (%u > %lu)", @@ -2166,71 +1943,50 @@ index 000000000..8d8481fc1 + return -BCH_ERR_invalid_bkey; + } + -+ if (rw == WRITE && -+ !(flags & BKEY_INVALID_JOURNAL) && -+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_btree_backpointers) { -+ unsigned i, bp_len = 0; -+ -+ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) -+ bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len; -+ -+ if (bp_len > 
a.v->dirty_sectors) { -+ prt_printf(err, "too many backpointers"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { ++ prt_printf(err, "invalid data type (got %u should be %u)", ++ a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); ++ return -BCH_ERR_invalid_bkey; + } + -+ if (rw == WRITE) { -+ if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { -+ prt_printf(err, "invalid data type (got %u should be %u)", -+ a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); ++ switch (a.v->data_type) { ++ case BCH_DATA_free: ++ case BCH_DATA_need_gc_gens: ++ case BCH_DATA_need_discard: ++ if (a.v->dirty_sectors || ++ a.v->cached_sectors || ++ a.v->stripe) { ++ prt_printf(err, "empty data type free but have data"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ break; ++ case BCH_DATA_sb: ++ case BCH_DATA_journal: ++ case BCH_DATA_btree: ++ case BCH_DATA_user: ++ case BCH_DATA_parity: ++ if (!a.v->dirty_sectors) { ++ prt_printf(err, "data_type %s but dirty_sectors==0", ++ bch2_data_types[a.v->data_type]); ++ return -BCH_ERR_invalid_bkey; ++ } ++ break; ++ case BCH_DATA_cached: ++ if (!a.v->cached_sectors || ++ a.v->dirty_sectors || ++ a.v->stripe) { ++ prt_printf(err, "data type inconsistency"); + return -BCH_ERR_invalid_bkey; + } + -+ switch (a.v->data_type) { -+ case BCH_DATA_free: -+ case BCH_DATA_need_gc_gens: -+ case BCH_DATA_need_discard: -+ if (a.v->dirty_sectors || -+ a.v->cached_sectors || -+ a.v->stripe) { -+ prt_printf(err, "empty data type free but have data"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ break; -+ case BCH_DATA_sb: -+ case BCH_DATA_journal: -+ case BCH_DATA_btree: -+ case BCH_DATA_user: -+ case BCH_DATA_parity: -+ if (!a.v->dirty_sectors) { -+ prt_printf(err, "data_type %s but dirty_sectors==0", -+ bch2_data_types[a.v->data_type]); -+ return -BCH_ERR_invalid_bkey; -+ } -+ break; -+ case BCH_DATA_cached: -+ if (!a.v->cached_sectors || -+ a.v->dirty_sectors || -+ a.v->stripe) { -+ prt_printf(err, "data type inconsistency"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (!a.v->io_time[READ] && -+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { -+ prt_printf(err, "cached bucket with read_time == 0"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ break; -+ case BCH_DATA_stripe: -+ if (!a.v->stripe) { -+ prt_printf(err, "data_type %s but stripe==0", -+ bch2_data_types[a.v->data_type]); -+ return -BCH_ERR_invalid_bkey; -+ } -+ break; ++ if (!a.v->io_time[READ] && ++ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { ++ prt_printf(err, "cached bucket with read_time == 0"); ++ return -BCH_ERR_invalid_bkey; + } ++ break; ++ case BCH_DATA_stripe: ++ break; + } + + return 0; @@ -3216,7 +2972,7 @@ index 000000000..8d8481fc1 + struct btree_iter *iter, + struct bpos end) +{ -+ if (!btree_node_type_is_extents(iter->btree_id)) { ++ if (!btree_id_is_extents(iter->btree_id)) { + return __bch2_check_discard_freespace_key(trans, iter); + } else { + int ret; @@ -4354,10 +4110,10 @@ index 000000000..c0914feb5 +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 -index 000000000..fcb7311b1 +index 000000000..e02749ddc --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,1536 @@ +@@ -0,0 +1,1571 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2012 Google, Inc. 
@@ -5349,7 +5105,6 @@ index 000000000..fcb7311b1 + cl = _cl; + goto retry_blocking; + } -+ + } + + return ret; @@ -5391,6 +5146,16 @@ index 000000000..fcb7311b1 + return ret < 0 ? ret : 0; +} + ++/** ++ * should_drop_bucket - check if this is open_bucket should go away ++ * @ca: if set, we're killing buckets for a particular device ++ * @ec: if true, we're shutting down erasure coding and killing all ec ++ * open_buckets ++ * otherwise, return true ++ * ++ * We're killing open_buckets because we're shutting down a device, erasure ++ * coding, or the entire filesystem - check if this open_bucket matches: ++ */ +static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, + struct bch_dev *ca, bool ec) +{ @@ -5402,8 +5167,12 @@ index 000000000..fcb7311b1 + unsigned i; + + if (!drop && ob->ec) { ++ unsigned nr_blocks; ++ + mutex_lock(&ob->ec->lock); -+ for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) { ++ nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks; ++ ++ for (i = 0; i < nr_blocks; i++) { + if (!ob->ec->blocks[i]) + continue; + @@ -5872,31 +5641,53 @@ index 000000000..fcb7311b1 + NULL +}; + ++static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, ++ struct write_point *wp) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ prt_printf(out, "%lu: ", wp->write_point); ++ prt_human_readable_u64(out, wp->sectors_allocated); ++ ++ prt_printf(out, " last wrote: "); ++ bch2_pr_time_units(out, sched_clock() - wp->last_used); ++ ++ for (i = 0; i < WRITE_POINT_STATE_NR; i++) { ++ prt_printf(out, " %s: ", bch2_write_point_states[i]); ++ bch2_pr_time_units(out, wp->time[i]); ++ } ++ ++ prt_newline(out); ++ ++ printbuf_indent_add(out, 2); ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ bch2_open_bucket_to_text(out, c, ob); ++ printbuf_indent_sub(out, 2); ++} ++ +void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct write_point *wp; -+ unsigned i; + ++ prt_str(out, "Foreground write points\n"); + for (wp = c->write_points; + wp < c->write_points + ARRAY_SIZE(c->write_points); -+ wp++) { -+ prt_printf(out, "%lu: ", wp->write_point); -+ prt_human_readable_u64(out, wp->sectors_allocated); ++ wp++) ++ bch2_write_point_to_text(out, c, wp); + -+ prt_printf(out, " last wrote: "); -+ bch2_pr_time_units(out, sched_clock() - wp->last_used); ++ prt_str(out, "Copygc write point\n"); ++ bch2_write_point_to_text(out, c, &c->copygc_write_point); + -+ for (i = 0; i < WRITE_POINT_STATE_NR; i++) { -+ prt_printf(out, " %s: ", bch2_write_point_states[i]); -+ bch2_pr_time_units(out, wp->time[i]); -+ } ++ prt_str(out, "Rebalance write point\n"); ++ bch2_write_point_to_text(out, c, &c->rebalance_write_point); + -+ prt_newline(out); -+ } ++ prt_str(out, "Btree write point\n"); ++ bch2_write_point_to_text(out, c, &c->btree_write_point); +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h new file mode 100644 -index 000000000..fee195f7e +index 000000000..7aaeec44c --- /dev/null +++ b/fs/bcachefs/alloc_foreground.h @@ -0,0 +1,224 @@ @@ -5907,7 +5698,7 @@ index 000000000..fee195f7e +#include "bcachefs.h" +#include "alloc_types.h" +#include "extents.h" -+#include "super.h" ++#include "sb-members.h" + +#include + @@ -6126,7 +5917,7 @@ index 000000000..fee195f7e +#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h new file mode 100644 -index 000000000..c33a29954 +index 000000000..b91b7a461 --- /dev/null +++ b/fs/bcachefs/alloc_types.h @@ -0,0 +1,126 @@ @@ 
-6164,7 +5955,7 @@ index 000000000..c33a29954 +}; + +#define BCH_WATERMARK_BITS 3 -+#define BCH_WATERMARK_MASK ~(~0 << BCH_WATERMARK_BITS) ++#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS) + +#define OPEN_BUCKETS_COUNT 1024 + @@ -6237,7 +6028,7 @@ index 000000000..c33a29954 + struct dev_stripe_state stripe; + + u64 sectors_allocated; -+ } __attribute__((__aligned__(SMP_CACHE_BYTES))); ++ } __aligned(SMP_CACHE_BYTES); + + struct { + struct work_struct index_update_work; @@ -6248,7 +6039,7 @@ index 000000000..c33a29954 + enum write_point_state state; + u64 last_state_change; + u64 time[WRITE_POINT_STATE_NR]; -+ } __attribute__((__aligned__(SMP_CACHE_BYTES))); ++ } __aligned(SMP_CACHE_BYTES); +}; + +struct write_point_specifier { @@ -7328,10 +7119,10 @@ index 000000000..1fbed1f83 +#endif /* _BCACHEFS_BBPOS_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 -index 000000000..82b0706a8 +index 000000000..30b3d7b9f --- /dev/null +++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,1201 @@ +@@ -0,0 +1,1146 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H @@ -7542,6 +7333,7 @@ index 000000000..82b0706a8 +#include "fifo.h" +#include "nocow_locking_types.h" +#include "opts.h" ++#include "recovery_types.h" +#include "seqmutex.h" +#include "util.h" + @@ -7627,8 +7419,8 @@ index 000000000..82b0706a8 + +#define bch_err_fn(_c, _ret) \ + bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret)) -+#define bch_err_msg(_c, _ret, _msg) \ -+ bch_err(_c, "%s(): error " _msg " %s", __func__, bch2_err_str(_ret)) ++#define bch_err_msg(_c, _ret, _msg, ...) \ ++ bch_err(_c, "%s(): error " _msg " %s", __func__, ##__VA_ARGS__, bch2_err_str(_ret)) + +#define bch_verbose(c, fmt, ...) \ +do { \ @@ -7786,6 +7578,7 @@ index 000000000..82b0706a8 + GC_PHASE_BTREE_backpointers, + GC_PHASE_BTREE_bucket_gens, + GC_PHASE_BTREE_snapshot_trees, ++ GC_PHASE_BTREE_deleted_inodes, + + GC_PHASE_PENDING_DELETE, +}; @@ -7989,48 +7782,6 @@ index 000000000..82b0706a8 + BCH_WRITE_REF_NR, +}; + -+#define PASS_SILENT BIT(0) -+#define PASS_FSCK BIT(1) -+#define PASS_UNCLEAN BIT(2) -+#define PASS_ALWAYS BIT(3) -+ -+#define BCH_RECOVERY_PASSES() \ -+ x(alloc_read, PASS_ALWAYS) \ -+ x(stripes_read, PASS_ALWAYS) \ -+ x(initialize_subvolumes, 0) \ -+ x(snapshots_read, PASS_ALWAYS) \ -+ x(check_topology, 0) \ -+ x(check_allocations, PASS_FSCK) \ -+ x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ -+ x(journal_replay, PASS_ALWAYS) \ -+ x(check_alloc_info, PASS_FSCK) \ -+ x(check_lrus, PASS_FSCK) \ -+ x(check_btree_backpointers, PASS_FSCK) \ -+ x(check_backpointers_to_extents,PASS_FSCK) \ -+ x(check_extents_to_backpointers,PASS_FSCK) \ -+ x(check_alloc_to_lru_refs, PASS_FSCK) \ -+ x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ -+ x(bucket_gens_init, 0) \ -+ x(check_snapshot_trees, PASS_FSCK) \ -+ x(check_snapshots, PASS_FSCK) \ -+ x(check_subvols, PASS_FSCK) \ -+ x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ -+ x(fs_upgrade_for_subvolumes, 0) \ -+ x(check_inodes, PASS_FSCK|PASS_UNCLEAN) \ -+ x(check_extents, PASS_FSCK) \ -+ x(check_dirents, PASS_FSCK) \ -+ x(check_xattrs, PASS_FSCK) \ -+ x(check_root, PASS_FSCK) \ -+ x(check_directory_structure, PASS_FSCK) \ -+ x(check_nlinks, PASS_FSCK) \ -+ x(fix_reflink_p, 0) \ -+ -+enum bch_recovery_pass { -+#define x(n, when) BCH_RECOVERY_PASS_##n, -+ BCH_RECOVERY_PASSES() -+#undef x -+}; -+ +struct bch_fs { + struct closure cl; + @@ -8369,6 +8120,7 @@ index 000000000..82b0706a8 + enum bch_recovery_pass curr_recovery_pass; + /* bitmap of 
explicitly enabled recovery passes: */ + u64 recovery_passes_explicit; ++ u64 recovery_passes_complete; + + /* DEBUG JUNK */ + struct dentry *fs_debug_dir; @@ -8513,32 +8265,16 @@ index 000000000..82b0706a8 + return dev < c->sb.nr_devices && c->devs[dev]; +} + -+/* -+ * For when we need to rewind recovery passes and run a pass we skipped: -+ */ -+static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, -+ enum bch_recovery_pass pass) -+{ -+ c->recovery_passes_explicit |= BIT_ULL(pass); -+ -+ if (c->curr_recovery_pass >= pass) { -+ c->curr_recovery_pass = pass; -+ return -BCH_ERR_restart_recovery; -+ } else { -+ return 0; -+ } -+} -+ +#define BKEY_PADDED_ONSTACK(key, pad) \ + struct { struct bkey_i key; __u64 key ## _pad[pad]; } + +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 -index 000000000..5c308f842 +index 000000000..f17238be4 --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h -@@ -0,0 +1,2319 @@ +@@ -0,0 +1,2368 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FORMAT_H +#define _BCACHEFS_FORMAT_H @@ -9457,9 +9193,7 @@ index 000000000..5c308f842 +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + -+#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(__u64) - \ -+ sizeof(struct bkey) - \ -+ offsetof(struct bch_dirent, d_name))) ++#define BCH_NAME_MAX 512 + +/* Xattrs */ + @@ -9667,6 +9401,11 @@ index 000000000..5c308f842 + __le32 flags; + __le32 snapshot; + __le64 inode; ++ /* ++ * Snapshot subvolumes form a tree, separate from the snapshot nodes ++ * tree - if this subvolume is a snapshot, this is the ID of the ++ * subvolume it was created from: ++ */ + __le32 parent; + __le32 pad; + bch_le128 otime; @@ -9688,6 +9427,7 @@ index 000000000..5c308f842 + __le32 parent; + __le32 children[2]; + __le32 subvol; ++ /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ + __le32 tree; + __le32 depth; + __le32 skip[3]; @@ -10170,7 +9910,9 @@ index 000000000..5c308f842 + x(major_minor, BCH_VERSION(1, 0), \ + 0) \ + x(snapshot_skiplists, BCH_VERSION(1, 1), \ -+ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) ++ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \ ++ x(deleted_inodes, BCH_VERSION(1, 2), \ ++ BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) + +enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, @@ -10679,7 +10421,7 @@ index 000000000..5c308f842 + __le64 _buckets_unavailable; /* No longer used */ + + struct jset_entry_dev_usage_type d[]; -+} __packed; ++}; + +static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) +{ @@ -10735,26 +10477,69 @@ index 000000000..5c308f842 + +/* Btree: */ + -+#define BCH_BTREE_IDS() \ -+ x(extents, 0) \ -+ x(inodes, 1) \ -+ x(dirents, 2) \ -+ x(xattrs, 3) \ -+ x(alloc, 4) \ -+ x(quotas, 5) \ -+ x(stripes, 6) \ -+ x(reflink, 7) \ -+ x(subvolumes, 8) \ -+ x(snapshots, 9) \ -+ x(lru, 10) \ -+ x(freespace, 11) \ -+ x(need_discard, 12) \ -+ x(backpointers, 13) \ -+ x(bucket_gens, 14) \ -+ x(snapshot_trees, 15) ++enum btree_id_flags { ++ BTREE_ID_EXTENTS = BIT(0), ++ BTREE_ID_SNAPSHOTS = BIT(1), ++ BTREE_ID_DATA = BIT(2), ++}; ++ ++#define BCH_BTREE_IDS() \ ++ x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\ ++ BIT_ULL(KEY_TYPE_whiteout)| \ ++ BIT_ULL(KEY_TYPE_error)| \ ++ BIT_ULL(KEY_TYPE_cookie)| \ ++ BIT_ULL(KEY_TYPE_extent)| \ ++ BIT_ULL(KEY_TYPE_reservation)| \ ++ BIT_ULL(KEY_TYPE_reflink_p)| \ ++ BIT_ULL(KEY_TYPE_inline_data)) \ ++ x(inodes, 1, BTREE_ID_SNAPSHOTS, \ ++ BIT_ULL(KEY_TYPE_whiteout)| \ ++ 
BIT_ULL(KEY_TYPE_inode)| \ ++ BIT_ULL(KEY_TYPE_inode_v2)| \ ++ BIT_ULL(KEY_TYPE_inode_v3)| \ ++ BIT_ULL(KEY_TYPE_inode_generation)) \ ++ x(dirents, 2, BTREE_ID_SNAPSHOTS, \ ++ BIT_ULL(KEY_TYPE_whiteout)| \ ++ BIT_ULL(KEY_TYPE_hash_whiteout)| \ ++ BIT_ULL(KEY_TYPE_dirent)) \ ++ x(xattrs, 3, BTREE_ID_SNAPSHOTS, \ ++ BIT_ULL(KEY_TYPE_whiteout)| \ ++ BIT_ULL(KEY_TYPE_cookie)| \ ++ BIT_ULL(KEY_TYPE_hash_whiteout)| \ ++ BIT_ULL(KEY_TYPE_xattr)) \ ++ x(alloc, 4, 0, \ ++ BIT_ULL(KEY_TYPE_alloc)| \ ++ BIT_ULL(KEY_TYPE_alloc_v2)| \ ++ BIT_ULL(KEY_TYPE_alloc_v3)| \ ++ BIT_ULL(KEY_TYPE_alloc_v4)) \ ++ x(quotas, 5, 0, \ ++ BIT_ULL(KEY_TYPE_quota)) \ ++ x(stripes, 6, 0, \ ++ BIT_ULL(KEY_TYPE_stripe)) \ ++ x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \ ++ BIT_ULL(KEY_TYPE_reflink_v)| \ ++ BIT_ULL(KEY_TYPE_indirect_inline_data)) \ ++ x(subvolumes, 8, 0, \ ++ BIT_ULL(KEY_TYPE_subvolume)) \ ++ x(snapshots, 9, 0, \ ++ BIT_ULL(KEY_TYPE_snapshot)) \ ++ x(lru, 10, 0, \ ++ BIT_ULL(KEY_TYPE_set)) \ ++ x(freespace, 11, BTREE_ID_EXTENTS, \ ++ BIT_ULL(KEY_TYPE_set)) \ ++ x(need_discard, 12, 0, \ ++ BIT_ULL(KEY_TYPE_set)) \ ++ x(backpointers, 13, 0, \ ++ BIT_ULL(KEY_TYPE_backpointer)) \ ++ x(bucket_gens, 14, 0, \ ++ BIT_ULL(KEY_TYPE_bucket_gens)) \ ++ x(snapshot_trees, 15, 0, \ ++ BIT_ULL(KEY_TYPE_snapshot_tree)) \ ++ x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \ ++ BIT_ULL(KEY_TYPE_set)) + +enum btree_id { -+#define x(kwd, val) BTREE_ID_##kwd = val, ++#define x(name, nr, ...) BTREE_ID_##name = nr, + BCH_BTREE_IDS() +#undef x + BTREE_ID_NR @@ -11234,10 +11019,10 @@ index 000000000..f05881f7e +#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c new file mode 100644 -index 000000000..ee7ba700e +index 000000000..0a5bfe6e9 --- /dev/null +++ b/fs/bcachefs/bkey.c -@@ -0,0 +1,1063 @@ +@@ -0,0 +1,1107 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -11247,14 +11032,6 @@ index 000000000..ee7ba700e +#include "bset.h" +#include "util.h" + -+#undef EBUG_ON -+ -+#ifdef DEBUG_BKEYS -+#define EBUG_ON(cond) BUG_ON(cond) -+#else -+#define EBUG_ON(cond) -+#endif -+ +const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; + +void bch2_bkey_packed_to_binary_text(struct printbuf *out, @@ -11425,6 +11202,28 @@ index 000000000..ee7ba700e +} + +__always_inline ++static void __set_inc_field(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ ++ if (bits) { ++ if (bits > state->bits) { ++ bits -= state->bits; ++ /* avoid shift by 64 if bits is 64 - bits is never 0 here: */ ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ } ++} ++ ++__always_inline +static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; @@ -11438,20 +11237,7 @@ index 000000000..ee7ba700e + if (fls64(v) > bits) + return false; + -+ if (bits > state->bits) { -+ bits -= state->bits; -+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ -+ state->w |= (v >> 1) >> (bits - 1); -+ -+ *state->p = state->w; -+ state->p = next_word(state->p); -+ state->w = 0; -+ state->bits = 64; -+ } -+ -+ state->bits -= bits; -+ state->w |= v << state->bits; -+ ++ __set_inc_field(state, field, v); + return true; +} + @@ -11620,19 +11406,7 @@ index 000000000..ee7ba700e + ret = false; + } + -+ if (bits > state->bits) { -+ bits -= 
state->bits; -+ state->w |= (v >> 1) >> (bits - 1); -+ -+ *state->p = state->w; -+ state->p = next_word(state->p); -+ state->w = 0; -+ state->bits = 64; -+ } -+ -+ state->bits -= bits; -+ state->w |= v << state->bits; -+ ++ __set_inc_field(state, field, v); + return ret; +} + @@ -11675,6 +11449,24 @@ index 000000000..ee7ba700e + + return false; +} ++ ++static bool bkey_format_has_too_big_fields(const struct bkey_format *f) ++{ ++ for (unsigned i = 0; i < f->nr_fields; i++) { ++ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; ++ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ u64 packed_max = f->bits_per_field[i] ++ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) ++ : 0; ++ u64 field_offset = le64_to_cpu(f->field_offset[i]); ++ ++ if (packed_max + field_offset < packed_max || ++ packed_max + field_offset > unpacked_max) ++ return true; ++ } ++ ++ return false; ++} +#endif + +/* @@ -11755,7 +11547,8 @@ index 000000000..ee7ba700e + + BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); + BUG_ON(bkey_packed_successor(&successor, b, *out) && -+ bkey_cmp_left_packed(b, &successor, &orig) < 0); ++ bkey_cmp_left_packed(b, &successor, &orig) < 0 && ++ !bkey_format_has_too_big_fields(f)); + } +#endif + @@ -11823,8 +11616,10 @@ index 000000000..ee7ba700e + + /* allow for extent merging: */ + if (ret.bits_per_field[BKEY_FIELD_SIZE]) { -+ ret.bits_per_field[BKEY_FIELD_SIZE] += 4; -+ bits += 4; ++ unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]); ++ ++ ret.bits_per_field[BKEY_FIELD_SIZE] += b; ++ bits += b; + } + + ret.key_u64s = DIV_ROUND_UP(bits, 64); @@ -11844,40 +11639,74 @@ index 000000000..ee7ba700e + } + } + -+ EBUG_ON(bch2_bkey_format_validate(&ret)); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ { ++ struct printbuf buf = PRINTBUF; ++ ++ BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); ++ printbuf_exit(&buf); ++ } ++#endif + return ret; +} + -+const char *bch2_bkey_format_validate(struct bkey_format *f) ++int bch2_bkey_format_invalid(struct bch_fs *c, ++ struct bkey_format *f, ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + unsigned i, bits = KEY_PACKED_BITS_START; + -+ if (f->nr_fields != BKEY_NR_FIELDS) -+ return "incorrect number of fields"; ++ if (f->nr_fields != BKEY_NR_FIELDS) { ++ prt_printf(err, "incorrect number of fields: got %u, should be %u", ++ f->nr_fields, BKEY_NR_FIELDS); ++ return -BCH_ERR_invalid; ++ } + + /* + * Verify that the packed format can't represent fields larger than the + * unpacked format: + */ + for (i = 0; i < f->nr_fields; i++) { -+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; -+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); -+ u64 packed_max = f->bits_per_field[i] -+ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) -+ : 0; -+ u64 field_offset = le64_to_cpu(f->field_offset[i]); ++ if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) { ++ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; ++ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ u64 packed_max = f->bits_per_field[i] ++ ? 
~((~0ULL << 1) << (f->bits_per_field[i] - 1)) ++ : 0; ++ u64 field_offset = le64_to_cpu(f->field_offset[i]); + -+ if (packed_max + field_offset < packed_max || -+ packed_max + field_offset > unpacked_max) -+ return "field too large"; ++ if (packed_max + field_offset < packed_max || ++ packed_max + field_offset > unpacked_max) { ++ prt_printf(err, "field %u too large: %llu + %llu > %llu", ++ i, packed_max, field_offset, unpacked_max); ++ return -BCH_ERR_invalid; ++ } ++ } + + bits += f->bits_per_field[i]; + } + -+ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) -+ return "incorrect key_u64s"; ++ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) { ++ prt_printf(err, "incorrect key_u64s: got %u, should be %u", ++ f->key_u64s, DIV_ROUND_UP(bits, 64)); ++ return -BCH_ERR_invalid; ++ } + -+ return NULL; ++ return 0; ++} ++ ++void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f) ++{ ++ prt_printf(out, "u64s %u fields ", f->key_u64s); ++ ++ for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) { ++ if (i) ++ prt_str(out, ", "); ++ prt_printf(out, "%u:%llu", ++ f->bits_per_field[i], ++ le64_to_cpu(f->field_offset[i])); ++ } +} + +/* @@ -12303,10 +12132,10 @@ index 000000000..ee7ba700e +#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h new file mode 100644 -index 000000000..e81fb3e00 +index 000000000..51969a462 --- /dev/null +++ b/fs/bcachefs/bkey.h -@@ -0,0 +1,774 @@ +@@ -0,0 +1,782 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_H +#define _BCACHEFS_BKEY_H @@ -12318,6 +12147,12 @@ index 000000000..e81fb3e00 +#include "util.h" +#include "vstructs.h" + ++enum bkey_invalid_flags { ++ BKEY_INVALID_WRITE = (1U << 0), ++ BKEY_INVALID_COMMIT = (1U << 1), ++ BKEY_INVALID_JOURNAL = (1U << 2), ++}; ++ +#if 0 + +/* @@ -13078,7 +12913,9 @@ index 000000000..e81fb3e00 + +void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); +struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); -+const char *bch2_bkey_format_validate(struct bkey_format *); ++int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, ++ enum bkey_invalid_flags, struct printbuf *); ++void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); + +#endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h @@ -13285,10 +13122,10 @@ index 000000000..5f42a6e69 +#endif /* _BCACHEFS_BKEY_CMP_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 -index 000000000..1381166bf +index 000000000..6547142db --- /dev/null +++ b/fs/bcachefs/bkey_methods.c -@@ -0,0 +1,519 @@ +@@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -13304,6 +13141,7 @@ index 000000000..1381166bf +#include "lru.h" +#include "quota.h" +#include "reflink.h" ++#include "snapshot.h" +#include "subvolume.h" +#include "xattr.h" + @@ -13431,78 +13269,14 @@ index 000000000..1381166bf + return ops->key_invalid(c, k, flags, err); +} + -+static unsigned bch2_key_types_allowed[] = { -+ [BKEY_TYPE_extents] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_whiteout)| -+ (1U << KEY_TYPE_error)| -+ (1U << KEY_TYPE_cookie)| -+ (1U << KEY_TYPE_extent)| -+ (1U << KEY_TYPE_reservation)| -+ (1U << KEY_TYPE_reflink_p)| -+ (1U << KEY_TYPE_inline_data), -+ [BKEY_TYPE_inodes] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_whiteout)| -+ (1U << KEY_TYPE_inode)| -+ (1U << KEY_TYPE_inode_v2)| -+ (1U << KEY_TYPE_inode_v3)| -+ (1U << KEY_TYPE_inode_generation), -+ [BKEY_TYPE_dirents] = -+ (1U << 
KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_whiteout)| -+ (1U << KEY_TYPE_hash_whiteout)| -+ (1U << KEY_TYPE_dirent), -+ [BKEY_TYPE_xattrs] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_whiteout)| -+ (1U << KEY_TYPE_cookie)| -+ (1U << KEY_TYPE_hash_whiteout)| -+ (1U << KEY_TYPE_xattr), -+ [BKEY_TYPE_alloc] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_alloc)| -+ (1U << KEY_TYPE_alloc_v2)| -+ (1U << KEY_TYPE_alloc_v3)| -+ (1U << KEY_TYPE_alloc_v4), -+ [BKEY_TYPE_quotas] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_quota), -+ [BKEY_TYPE_stripes] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_stripe), -+ [BKEY_TYPE_reflink] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_reflink_v)| -+ (1U << KEY_TYPE_indirect_inline_data), -+ [BKEY_TYPE_subvolumes] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_subvolume), -+ [BKEY_TYPE_snapshots] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_snapshot), -+ [BKEY_TYPE_lru] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_set), -+ [BKEY_TYPE_freespace] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_set), -+ [BKEY_TYPE_need_discard] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_set), -+ [BKEY_TYPE_backpointers] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_backpointer), -+ [BKEY_TYPE_bucket_gens] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_bucket_gens), -+ [BKEY_TYPE_snapshot_trees] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_snapshot_tree), ++static u64 bch2_key_types_allowed[] = { ++#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, ++ BCH_BTREE_IDS() ++#undef x + [BKEY_TYPE_btree] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_btree_ptr)| -+ (1U << KEY_TYPE_btree_ptr_v2), ++ BIT_ULL(KEY_TYPE_deleted)| ++ BIT_ULL(KEY_TYPE_btree_ptr)| ++ BIT_ULL(KEY_TYPE_btree_ptr_v2), +}; + +int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -13516,7 +13290,7 @@ index 000000000..1381166bf + } + + if (flags & BKEY_INVALID_COMMIT && -+ !(bch2_key_types_allowed[type] & (1U << k.k->type))) { ++ !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type))) { + prt_printf(err, "invalid key type for btree %s (%s)", + bch2_btree_ids[type], bch2_bkey_types[k.k->type]); + return -BCH_ERR_invalid_bkey; @@ -13810,10 +13584,10 @@ index 000000000..1381166bf +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h new file mode 100644 -index 000000000..f4e60d2e6 +index 000000000..668f595e2 --- /dev/null +++ b/fs/bcachefs/bkey_methods.h -@@ -0,0 +1,193 @@ +@@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_METHODS_H +#define _BCACHEFS_BKEY_METHODS_H @@ -13829,12 +13603,6 @@ index 000000000..f4e60d2e6 +extern const char * const bch2_bkey_types[]; +extern const struct bkey_ops bch2_bkey_null_ops; + -+enum bkey_invalid_flags { -+ BKEY_INVALID_WRITE = (1U << 0), -+ BKEY_INVALID_COMMIT = (1U << 1), -+ BKEY_INVALID_JOURNAL = (1U << 2), -+}; -+ +/* + * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If + * invalid, entire key will be deleted. 
@@ -13871,11 +13639,12 @@ index 000000000..f4e60d2e6 + : &bch2_bkey_null_ops; +} + -+int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -+int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, -+ enum btree_node_type, unsigned, struct printbuf *); -+int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, -+ enum btree_node_type, unsigned, struct printbuf *); ++int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); ++int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, ++ enum bkey_invalid_flags, struct printbuf *); ++int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, ++ enum bkey_invalid_flags, struct printbuf *); +int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); + +void bch2_bpos_to_text(struct printbuf *, struct bpos); @@ -16406,10 +16175,10 @@ index 000000000..632c2b8c5 +#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file mode 100644 -index 000000000..13c88d953 +index 000000000..a8283fdc7 --- /dev/null +++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1277 @@ +@@ -0,0 +1,1274 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -17626,7 +17395,6 @@ index 000000000..13c88d953 +void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + const struct btree *b) +{ -+ const struct bkey_format *f = &b->format; + struct bset_stats stats; + + memset(&stats, 0, sizeof(stats)); @@ -17640,9 +17408,13 @@ index 000000000..13c88d953 + prt_printf(out, ":\n" + " ptrs: "); + bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ prt_newline(out); + -+ prt_printf(out, "\n" -+ " format: u64s %u fields %u %u %u %u %u\n" ++ prt_printf(out, ++ " format: "); ++ bch2_bkey_format_to_text(out, &b->format); ++ ++ prt_printf(out, + " unpack fn len: %u\n" + " bytes used %zu/%zu (%zu%% full)\n" + " sib u64s: %u, %u (merge threshold %u)\n" @@ -17650,12 +17422,6 @@ index 000000000..13c88d953 + " nr unpacked keys %u\n" + " floats %zu\n" + " failed unpacked %zu\n", -+ f->key_u64s, -+ f->bits_per_field[0], -+ f->bits_per_field[1], -+ f->bits_per_field[2], -+ f->bits_per_field[3], -+ f->bits_per_field[4], + b->unpack_fn_len, + b->nr.live_u64s * sizeof(u64), + btree_bytes(c) - sizeof(struct btree_node), @@ -17825,10 +17591,10 @@ index 000000000..00c9b9218 +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c new file mode 100644 -index 000000000..49e9822dd +index 000000000..83dcd9eb2 --- /dev/null +++ b/fs/bcachefs/btree_gc.c -@@ -0,0 +1,2126 @@ +@@ -0,0 +1,2127 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -17840,6 +17606,7 @@ index 000000000..49e9822dd +#include "alloc_foreground.h" +#include "bkey_methods.h" +#include "bkey_buf.h" ++#include "btree_journal_iter.h" +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update_interior.h" @@ -17874,7 +17641,7 @@ index 000000000..49e9822dd +static bool should_restart_for_topology_repair(struct bch_fs *c) +{ + return c->opts.fix_errors != FSCK_FIX_no && -+ !(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); ++ !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); +} + +static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) @@ -18366,7 +18133,7 @@ index 000000000..49e9822dd + + bch2_trans_init(&trans, c, 0, 0); + -+ for (i = 0; i < btree_id_nr_alive(c)&& !ret; 
i++) { ++ for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (!r->alive) @@ -19957,14 +19724,15 @@ index 000000000..49e9822dd +} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h new file mode 100644 -index 000000000..402c69184 +index 000000000..607575f83 --- /dev/null +++ b/fs/bcachefs/btree_gc.h -@@ -0,0 +1,113 @@ +@@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_GC_H +#define _BCACHEFS_BTREE_GC_H + ++#include "bkey.h" +#include "btree_types.h" + +int bch2_check_topology(struct bch_fs *); @@ -20014,7 +19782,7 @@ index 000000000..402c69184 +static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) +{ + switch (id) { -+#define x(name, v) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; ++#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; + BCH_BTREE_IDS() +#undef x + default: @@ -20076,10 +19844,10 @@ index 000000000..402c69184 +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 -index 000000000..c049876ee +index 000000000..cba3c081b --- /dev/null +++ b/fs/bcachefs/btree_io.c -@@ -0,0 +1,2267 @@ +@@ -0,0 +1,2245 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -20099,6 +19867,7 @@ index 000000000..c049876ee +#include "io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" ++#include "recovery.h" +#include "super-io.h" +#include "trace.h" + @@ -20187,8 +19956,8 @@ index 000000000..c049876ee + vpfree(p, size); +} + -+static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size, -+ bool *used_mempool) ++static void *btree_bounce_alloc(struct bch_fs *c, size_t size, ++ bool *used_mempool) +{ + unsigned flags = memalloc_nofs_save(); + void *p; @@ -20196,7 +19965,7 @@ index 000000000..c049876ee + BUG_ON(size > btree_bytes(c)); + + *used_mempool = false; -+ p = vpmalloc_noprof(size, __GFP_NOWARN|GFP_NOWAIT); ++ p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); + if (!p) { + *used_mempool = true; + p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); @@ -20204,8 +19973,6 @@ index 000000000..c049876ee + memalloc_nofs_restore(flags); + return p; +} -+#define btree_bounce_alloc(_c, _size, _used_mempool) \ -+ alloc_hooks(btree_bounce_alloc_noprof(_c, _size, _used_mempool)) + +static void sort_bkey_ptrs(const struct btree *bt, + struct bkey_packed **ptrs, unsigned nr) @@ -20625,31 +20392,7 @@ index 000000000..c049876ee + prt_str(out, ": "); +} + -+enum btree_err_type { -+ /* -+ * We can repair this locally, and we're after the checksum check so -+ * there's no need to try another replica: -+ */ -+ BTREE_ERR_FIXABLE, -+ /* -+ * We can repair this if we have to, but we should try reading another -+ * replica if we can: -+ */ -+ BTREE_ERR_WANT_RETRY, -+ /* -+ * Read another replica if we have one, otherwise consider the whole -+ * node bad: -+ */ -+ BTREE_ERR_MUST_RETRY, -+ BTREE_ERR_BAD_NODE, -+ BTREE_ERR_INCOMPATIBLE, -+}; -+ -+enum btree_validate_ret { -+ BTREE_RETRY_READ = 64, -+}; -+ -+static int __btree_err(enum btree_err_type type, ++static int __btree_err(int ret, + struct bch_fs *c, + struct bch_dev *ca, + struct btree *b, @@ -20660,7 +20403,6 @@ index 000000000..c049876ee +{ + struct printbuf out = PRINTBUF; + va_list args; -+ int ret = -BCH_ERR_fsck_fix; + + btree_err_msg(&out, c, ca, b, i, b->written, write); + @@ -20676,27 +20418,26 @@ index 000000000..c049876ee + goto out; + } + -+ if (!have_retry && type == BTREE_ERR_WANT_RETRY) -+ type = 
BTREE_ERR_FIXABLE; -+ if (!have_retry && type == BTREE_ERR_MUST_RETRY) -+ type = BTREE_ERR_BAD_NODE; ++ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) ++ ret = -BCH_ERR_btree_node_read_err_fixable; ++ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) ++ ret = -BCH_ERR_btree_node_read_err_bad_node; + -+ switch (type) { -+ case BTREE_ERR_FIXABLE: ++ switch (ret) { ++ case -BCH_ERR_btree_node_read_err_fixable: + mustfix_fsck_err(c, "%s", out.buf); + ret = -BCH_ERR_fsck_fix; + break; -+ case BTREE_ERR_WANT_RETRY: -+ case BTREE_ERR_MUST_RETRY: ++ case -BCH_ERR_btree_node_read_err_want_retry: ++ case -BCH_ERR_btree_node_read_err_must_retry: + bch2_print_string_as_lines(KERN_ERR, out.buf); -+ ret = BTREE_RETRY_READ; + break; -+ case BTREE_ERR_BAD_NODE: ++ case -BCH_ERR_btree_node_read_err_bad_node: + bch2_print_string_as_lines(KERN_ERR, out.buf); + bch2_topology_error(c); + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; + break; -+ case BTREE_ERR_INCOMPATIBLE: ++ case -BCH_ERR_btree_node_read_err_incompatible: + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = -BCH_ERR_fsck_errors_not_fixed; + break; @@ -20713,8 +20454,11 @@ index 000000000..c049876ee +({ \ + int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\ + \ -+ if (_ret != -BCH_ERR_fsck_fix) \ ++ if (_ret != -BCH_ERR_fsck_fix) { \ ++ ret = _ret; \ + goto fsck_err; \ ++ } \ ++ \ + *saw_error = true; \ +}) + @@ -20778,19 +20522,18 @@ index 000000000..c049876ee + int write, bool have_retry, bool *saw_error) +{ + unsigned version = le16_to_cpu(i->version); -+ const char *err; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + int ret = 0; + + btree_err_on(!bch2_version_compatible(version), -+ BTREE_ERR_INCOMPATIBLE, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, + "unsupported bset version %u.%u", + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version)); + + if (btree_err_on(version < c->sb.version_min, -+ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, + "bset version %u older than superblock version_min %u", + version, c->sb.version_min)) { + mutex_lock(&c->sb_lock); @@ -20801,7 +20544,7 @@ index 000000000..c049876ee + + if (btree_err_on(BCH_VERSION_MAJOR(version) > + BCH_VERSION_MAJOR(c->sb.version), -+ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, + "bset version %u newer than superblock version %u", + version, c->sb.version)) { + mutex_lock(&c->sb_lock); @@ -20811,11 +20554,11 @@ index 000000000..c049876ee + } + + btree_err_on(BSET_SEPARATE_WHITEOUTS(i), -+ BTREE_ERR_INCOMPATIBLE, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, + "BSET_SEPARATE_WHITEOUTS no longer supported"); + + if (btree_err_on(offset + sectors > btree_sectors(c), -+ BTREE_ERR_FIXABLE, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, + "bset past end of btree node")) { + i->u64s = 0; + ret = 0; @@ -20823,12 +20566,12 @@ index 000000000..c049876ee + } + + btree_err_on(offset && !i->u64s, -+ BTREE_ERR_FIXABLE, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, + "empty bset"); + + btree_err_on(BSET_OFFSET(i) && + BSET_OFFSET(i) != offset, -+ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, + "bset at wrong sector offset"); + + if (!offset) { @@ -20842,16 +20585,16 @@ index 000000000..c049876ee + + /* XXX endianness */ + 
btree_err_on(bp->seq != bn->keys.seq, -+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + "incorrect sequence number (wrong btree node)"); + } + + btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, -+ BTREE_ERR_MUST_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, + "incorrect btree id"); + + btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, -+ BTREE_ERR_MUST_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, + "incorrect level"); + + if (!write) @@ -20868,7 +20611,7 @@ index 000000000..c049876ee + } + + btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), -+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + "incorrect min_key: got %s should be %s", + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), @@ -20877,7 +20620,7 @@ index 000000000..c049876ee + } + + btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), -+ BTREE_ERR_MUST_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, + "incorrect max key %s", + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); @@ -20886,10 +20629,12 @@ index 000000000..c049876ee + compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + -+ err = bch2_bkey_format_validate(&bn->format); -+ btree_err_on(err, -+ BTREE_ERR_BAD_NODE, c, ca, b, i, -+ "invalid bkey format: %s", err); ++ btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), ++ -BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i, ++ "invalid bkey format: %s\n %s", buf1.buf, ++ (printbuf_reset(&buf2), ++ bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); ++ printbuf_reset(&buf1); + + compat_bformat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, @@ -20929,14 +20674,14 @@ index 000000000..c049876ee + struct bkey tmp; + + if (btree_err_on(bkey_p_next(k) > vstruct_last(i), -+ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, + "key extends past end of bset")) { + i->u64s = cpu_to_le16((u64 *) k - i->_data); + break; + } + + if (btree_err_on(k->format > KEY_FORMAT_CURRENT, -+ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, + "invalid bkey format %u", k->format)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), @@ -20960,7 +20705,7 @@ index 000000000..c049876ee + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); + -+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); ++ btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), @@ -20984,7 +20729,7 @@ index 000000000..c049876ee + + bch2_dump_bset(c, b, i, 0); + -+ if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { ++ if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); @@ -21027,16 +20772,16 @@ index 000000000..c049876ee + iter->size = (btree_blocks(c) + 1) * 2; + + if (bch2_meta_read_fault("btree")) -+ btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + "dynamic fault"); + + btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), -+ 
BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + "bad magic: want %llx, got %llx", + bset_magic(c), le64_to_cpu(b->data->magic)); + + btree_err_on(!b->data->keys.seq, -+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + "bad btree header: seq 0"); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { @@ -21044,7 +20789,7 @@ index 000000000..c049876ee + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + btree_err_on(b->data->keys.seq != bp->seq, -+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + "got wrong btree node (seq %llx want %llx)", + b->data->keys.seq, bp->seq); + } @@ -21059,7 +20804,7 @@ index 000000000..c049876ee + i = &b->data->keys; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, + "unknown checksum type %llu", + BSET_CSUM_TYPE(i)); + @@ -21067,7 +20812,7 @@ index 000000000..c049876ee + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + + btree_err_on(bch2_crc_cmp(csum, b->data->csum), -+ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, + "invalid checksum"); + + ret = bset_encrypt(c, i, b->written << 9); @@ -21077,7 +20822,7 @@ index 000000000..c049876ee + + btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), -+ BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL, ++ -BCH_ERR_btree_node_read_err_incompatible, c, NULL, b, NULL, + "btree node does not have NEW_EXTENT_OVERWRITE set"); + + sectors = vstruct_sectors(b->data, c->block_bits); @@ -21089,7 +20834,7 @@ index 000000000..c049876ee + break; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, + "unknown checksum type %llu", + BSET_CSUM_TYPE(i)); + @@ -21097,7 +20842,7 @@ index 000000000..c049876ee + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + + btree_err_on(bch2_crc_cmp(csum, bne->csum), -+ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, + "invalid checksum"); + + ret = bset_encrypt(c, i, b->written << 9); @@ -21130,12 +20875,12 @@ index 000000000..c049876ee + true); + + btree_err_on(blacklisted && first, -+ BTREE_ERR_FIXABLE, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, + "first btree node bset has blacklisted journal seq (%llu)", + le64_to_cpu(i->journal_seq)); + + btree_err_on(blacklisted && ptr_written, -+ BTREE_ERR_FIXABLE, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, + "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", + le64_to_cpu(i->journal_seq), + b->written, b->written + sectors, ptr_written); @@ -21154,7 +20899,7 @@ index 000000000..c049876ee + + if (ptr_written) { + btree_err_on(b->written < ptr_written, -+ BTREE_ERR_WANT_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, + "btree node data missing: expected %u sectors, found %u", + ptr_written, b->written); + } else { @@ -21165,7 +20910,7 @@ index 000000000..c049876ee + !bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), + true), -+ BTREE_ERR_WANT_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, + "found bset signature after last bset"); + + /* @@ -21219,7 +20964,7 @@ index 000000000..c049876ee + 
prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); + -+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); ++ btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); + + btree_keys_account_key_drop(&b->nr, 0, k); + @@ -21259,7 +21004,8 @@ index 000000000..c049876ee + printbuf_exit(&buf); + return retry_read; +fsck_err: -+ if (ret == BTREE_RETRY_READ) ++ if (ret == -BCH_ERR_btree_node_read_err_want_retry || ++ ret == -BCH_ERR_btree_node_read_err_must_retry) + retry_read = 1; + else + set_btree_node_read_error(b); @@ -21445,14 +21191,14 @@ index 000000000..c049876ee + } + + written2 = btree_node_sectors_written(c, ra->buf[i]); -+ if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, + "btree node sectors written mismatch: %u != %u", + written, written2) || + btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), -+ BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, + "found bset signature after last bset") || + btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), -+ BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, + "btree node replicas content mismatch")) + dump_bset_maps = true; + @@ -22349,7 +22095,7 @@ index 000000000..c049876ee +} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h new file mode 100644 -index 000000000..0cadf651e +index 000000000..cd99bbb00 --- /dev/null +++ b/fs/bcachefs/btree_io.h @@ -0,0 +1,228 @@ @@ -22498,8 +22244,8 @@ index 000000000..0cadf651e + __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, + __BTREE_WRITE_ALREADY_STARTED, +}; -+#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED ) -+#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED) ++#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED) ++#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED) + +void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); +void bch2_btree_node_write(struct bch_fs *, struct btree *, @@ -22583,10 +22329,10 @@ index 000000000..0cadf651e +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 -index 000000000..e292c5a2a +index 000000000..21c2bc8a8 --- /dev/null +++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,3214 @@ +@@ -0,0 +1,3194 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -22594,6 +22340,7 @@ index 000000000..e292c5a2a +#include "bkey_buf.h" +#include "btree_cache.h" +#include "btree_iter.h" ++#include "btree_journal_iter.h" +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update.h" @@ -22601,9 +22348,8 @@ index 000000000..e292c5a2a +#include "error.h" +#include "extents.h" +#include "journal.h" -+#include "recovery.h" +#include "replicas.h" -+#include "subvolume.h" ++#include "snapshot.h" +#include "trace.h" + +#include @@ -22624,18 +22370,6 @@ index 000000000..e292c5a2a + +static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); + -+/* -+ * Unlocks before scheduling -+ * Note: does not revalidate iterator -+ */ -+static inline int bch2_trans_cond_resched(struct btree_trans *trans) -+{ -+ if (need_resched() || race_fault()) -+ return drop_locks_do(trans, (schedule(), 0)); -+ else -+ return 0; -+} -+ +static inline int __btree_path_cmp(const struct btree_path *l, + enum btree_id 
r_btree_id, + bool r_cached, @@ -23609,7 +23343,7 @@ index 000000000..e292c5a2a + /* + * We used to assert that all paths had been traversed here + * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since -+ * path->Should_be_locked is not set yet, we we might have unlocked and ++ * path->should_be_locked is not set yet, we might have unlocked and + * then failed to relock a path - that's fine. + */ +err: @@ -23942,14 +23676,14 @@ index 000000000..e292c5a2a + __bch2_path_free(trans, path); +} + -+void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) ++void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) +{ + panic("trans->restart_count %u, should be %u, last restarted by %pS\n", + trans->restart_count, restart_count, + (void *) trans->last_begin_ip); +} + -+void bch2_trans_in_restart_error(struct btree_trans *trans) ++void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) +{ + panic("in transaction restart: %s, last restarted by %pS\n", + bch2_err_str(trans->restarted), @@ -25321,19 +25055,9 @@ index 000000000..e292c5a2a + iter->key_cache_path = NULL; +} + -+static inline void bch2_trans_iter_init_inlined(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags) -+{ -+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, -+ bch2_btree_iter_flags(trans, btree_id, flags), -+ _RET_IP_); -+} -+ +void bch2_trans_iter_init_outlined(struct btree_trans *trans, + struct btree_iter *iter, -+ unsigned btree_id, struct bpos pos, ++ enum btree_id btree_id, struct bpos pos, + unsigned flags) +{ + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, @@ -25349,9 +25073,9 @@ index 000000000..e292c5a2a + unsigned depth, + unsigned flags) +{ -+ flags |= BTREE_ITER_NOT_EXTENTS; -+ flags |= __BTREE_ITER_ALL_SNAPSHOTS; -+ flags |= BTREE_ITER_ALL_SNAPSHOTS; ++ flags |= BTREE_ITER_NOT_EXTENTS; ++ flags |= __BTREE_ITER_ALL_SNAPSHOTS; ++ flags |= BTREE_ITER_ALL_SNAPSHOTS; + + bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, + __bch2_btree_iter_flags(trans, btree_id, flags), @@ -25509,12 +25233,14 @@ index 000000000..e292c5a2a +#ifdef __KERNEL__ + p = this_cpu_xchg(c->btree_paths_bufs->path, NULL); +#endif -+ if (!p) ++ if (!p) { + p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); -+ /* -+ * paths need to be zeroed, bch2_check_for_deadlock looks at paths in -+ * other threads -+ */ ++ /* ++ * paths need to be zeroed, bch2_check_for_deadlock looks at ++ * paths in other threads ++ */ ++ memset(p, 0, paths_bytes); ++ } + + trans->paths = p; p += paths_bytes; + trans->updates = p; p += updates_bytes; @@ -25803,10 +25529,10 @@ index 000000000..e292c5a2a +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 -index 000000000..c472aa8c5 +index 000000000..4469b2e16 --- /dev/null +++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,924 @@ +@@ -0,0 +1,940 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_ITER_H +#define _BCACHEFS_BTREE_ITER_H @@ -26030,6 +25756,22 @@ index 000000000..c472aa8c5 + unsigned, unsigned, unsigned, unsigned long); +struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); + ++/* ++ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a ++ * different snapshot: ++ */ ++static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) ++{ ++ struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); ++ 
++ if (k.k && bpos_eq(path->pos, k.k->p)) ++ return k; ++ ++ bkey_init(u); ++ u->p = path->pos; ++ return (struct bkey_s_c) { u, NULL }; ++} ++ +struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, + struct btree_iter *, struct bpos); + @@ -26074,7 +25816,7 @@ index 000000000..c472aa8c5 + return restart_count != trans->restart_count; +} + -+void bch2_trans_restart_error(struct btree_trans *, u32); ++void __noreturn bch2_trans_restart_error(struct btree_trans *, u32); + +static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, + u32 restart_count) @@ -26083,7 +25825,7 @@ index 000000000..c472aa8c5 + bch2_trans_restart_error(trans, restart_count); +} + -+void bch2_trans_in_restart_error(struct btree_trans *); ++void __noreturn bch2_trans_in_restart_error(struct btree_trans *); + +static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) +{ @@ -26256,7 +25998,7 @@ index 000000000..c472aa8c5 +} + +void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, -+ unsigned, struct bpos, unsigned); ++ enum btree_id, struct bpos, unsigned); + +static inline void bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, @@ -26731,6 +26473,606 @@ index 000000000..c472aa8c5 +int bch2_fs_btree_iter_init(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_ITER_H */ +diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c +new file mode 100644 +index 000000000..58a981bcf +--- /dev/null ++++ b/fs/bcachefs/btree_journal_iter.c +@@ -0,0 +1,531 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bset.h" ++#include "btree_journal_iter.h" ++#include "journal_io.h" ++ ++#include ++ ++/* ++ * For managing keys we read from the journal: until journal replay works normal ++ * btree lookups need to be able to find and return keys from the journal where ++ * they overwrite what's in the btree, so we have a special iterator and ++ * operations for the regular btree iter code to use: ++ */ ++ ++static int __journal_key_cmp(enum btree_id l_btree_id, ++ unsigned l_level, ++ struct bpos l_pos, ++ const struct journal_key *r) ++{ ++ return (cmp_int(l_btree_id, r->btree_id) ?: ++ cmp_int(l_level, r->level) ?: ++ bpos_cmp(l_pos, r->k->k.p)); ++} ++ ++static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) ++{ ++ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); ++} ++ ++static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) ++{ ++ size_t gap_size = keys->size - keys->nr; ++ ++ if (idx >= keys->gap) ++ idx += gap_size; ++ return idx; ++} ++ ++static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) ++{ ++ return keys->d + idx_to_pos(keys, idx); ++} ++ ++static size_t __bch2_journal_key_search(struct journal_keys *keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ size_t l = 0, r = keys->nr, m; ++ ++ while (l < r) { ++ m = l + ((r - l) >> 1); ++ if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) ++ l = m + 1; ++ else ++ r = m; ++ } ++ ++ BUG_ON(l < keys->nr && ++ __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); ++ ++ BUG_ON(l && ++ __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); ++ ++ return l; ++} ++ ++static size_t bch2_journal_key_search(struct journal_keys *keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); ++} ++ ++struct bkey_i 
*bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, struct bpos pos, ++ struct bpos end_pos, size_t *idx) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ unsigned iters = 0; ++ struct journal_key *k; ++search: ++ if (!*idx) ++ *idx = __bch2_journal_key_search(keys, btree_id, level, pos); ++ ++ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { ++ if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) ++ return NULL; ++ ++ if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && ++ !k->overwritten) ++ return k->k; ++ ++ (*idx)++; ++ iters++; ++ if (iters == 10) { ++ *idx = 0; ++ goto search; ++ } ++ } ++ ++ return NULL; ++} ++ ++struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, struct bpos pos) ++{ ++ size_t idx = 0; ++ ++ return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); ++} ++ ++static void journal_iters_fix(struct bch_fs *c) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ /* The key we just inserted is immediately before the gap: */ ++ size_t gap_end = keys->gap + (keys->size - keys->nr); ++ struct btree_and_journal_iter *iter; ++ ++ /* ++ * If an iterator points one after the key we just inserted, decrement ++ * the iterator so it points at the key we just inserted - if the ++ * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will ++ * handle that: ++ */ ++ list_for_each_entry(iter, &c->journal_iters, journal.list) ++ if (iter->journal.idx == gap_end) ++ iter->journal.idx = keys->gap - 1; ++} ++ ++static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_iter *iter; ++ size_t gap_size = keys->size - keys->nr; ++ ++ list_for_each_entry(iter, &c->journal_iters, list) { ++ if (iter->idx > old_gap) ++ iter->idx -= gap_size; ++ if (iter->idx >= new_gap) ++ iter->idx += gap_size; ++ } ++} ++ ++int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ struct journal_key n = { ++ .btree_id = id, ++ .level = level, ++ .k = k, ++ .allocated = true, ++ /* ++ * Ensure these keys are done last by journal replay, to unblock ++ * journal reclaim: ++ */ ++ .journal_seq = U32_MAX, ++ }; ++ struct journal_keys *keys = &c->journal_keys; ++ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); ++ ++ BUG_ON(test_bit(BCH_FS_RW, &c->flags)); ++ ++ if (idx < keys->size && ++ journal_key_cmp(&n, &keys->d[idx]) == 0) { ++ if (keys->d[idx].allocated) ++ kfree(keys->d[idx].k); ++ keys->d[idx] = n; ++ return 0; ++ } ++ ++ if (idx > keys->gap) ++ idx -= keys->size - keys->nr; ++ ++ if (keys->nr == keys->size) { ++ struct journal_keys new_keys = { ++ .nr = keys->nr, ++ .size = max_t(size_t, keys->size, 8) * 2, ++ }; ++ ++ new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); ++ if (!new_keys.d) { ++ bch_err(c, "%s: error allocating new key array (size %zu)", ++ __func__, new_keys.size); ++ return -BCH_ERR_ENOMEM_journal_key_insert; ++ } ++ ++ /* Since @keys was full, there was no gap: */ ++ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); ++ kvfree(keys->d); ++ *keys = new_keys; ++ ++ /* And now the gap is at the end: */ ++ keys->gap = keys->nr; ++ } ++ ++ journal_iters_move_gap(c, keys->gap, idx); ++ ++ move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); ++ keys->gap = idx; ++ ++ keys->nr++; ++ keys->d[keys->gap++] = n; ++ ++ journal_iters_fix(c); ++ ++ return 0; 
++} ++ ++/* ++ * Can only be used from the recovery thread while we're still RO - can't be ++ * used once we've got RW, as journal_keys is at that point used by multiple ++ * threads: ++ */ ++int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ struct bkey_i *n; ++ int ret; ++ ++ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); ++ if (!n) ++ return -BCH_ERR_ENOMEM_journal_key_insert; ++ ++ bkey_copy(n, k); ++ ret = bch2_journal_key_insert_take(c, id, level, n); ++ if (ret) ++ kfree(n); ++ return ret; ++} ++ ++int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bpos pos) ++{ ++ struct bkey_i whiteout; ++ ++ bkey_init(&whiteout.k); ++ whiteout.k.p = pos; ++ ++ return bch2_journal_key_insert(c, id, level, &whiteout); ++} ++ ++void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, ++ unsigned level, struct bpos pos) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ size_t idx = bch2_journal_key_search(keys, btree, level, pos); ++ ++ if (idx < keys->size && ++ keys->d[idx].btree_id == btree && ++ keys->d[idx].level == level && ++ bpos_eq(keys->d[idx].k->k.p, pos)) ++ keys->d[idx].overwritten = true; ++} ++ ++static void bch2_journal_iter_advance(struct journal_iter *iter) ++{ ++ if (iter->idx < iter->keys->size) { ++ iter->idx++; ++ if (iter->idx == iter->keys->gap) ++ iter->idx += iter->keys->size - iter->keys->nr; ++ } ++} ++ ++static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) ++{ ++ struct journal_key *k = iter->keys->d + iter->idx; ++ ++ while (k < iter->keys->d + iter->keys->size && ++ k->btree_id == iter->btree_id && ++ k->level == iter->level) { ++ if (!k->overwritten) ++ return bkey_i_to_s_c(k->k); ++ ++ bch2_journal_iter_advance(iter); ++ k = iter->keys->d + iter->idx; ++ } ++ ++ return bkey_s_c_null; ++} ++ ++static void bch2_journal_iter_exit(struct journal_iter *iter) ++{ ++ list_del(&iter->list); ++} ++ ++static void bch2_journal_iter_init(struct bch_fs *c, ++ struct journal_iter *iter, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ iter->btree_id = id; ++ iter->level = level; ++ iter->keys = &c->journal_keys; ++ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); ++} ++ ++static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) ++{ ++ return bch2_btree_node_iter_peek_unpack(&iter->node_iter, ++ iter->b, &iter->unpacked); ++} ++ ++static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) ++{ ++ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); ++} ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) ++{ ++ if (bpos_eq(iter->pos, SPOS_MAX)) ++ iter->at_end = true; ++ else ++ iter->pos = bpos_successor(iter->pos); ++} ++ ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) ++{ ++ struct bkey_s_c btree_k, journal_k, ret; ++again: ++ if (iter->at_end) ++ return bkey_s_c_null; ++ ++ while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && ++ bpos_lt(btree_k.k->p, iter->pos)) ++ bch2_journal_iter_advance_btree(iter); ++ ++ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && ++ bpos_lt(journal_k.k->p, iter->pos)) ++ bch2_journal_iter_advance(&iter->journal); ++ ++ ret = journal_k.k && ++ (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) ++ ? 
journal_k ++ : btree_k; ++ ++ if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) ++ ret = bkey_s_c_null; ++ ++ if (ret.k) { ++ iter->pos = ret.k->p; ++ if (bkey_deleted(ret.k)) { ++ bch2_btree_and_journal_iter_advance(iter); ++ goto again; ++ } ++ } else { ++ iter->pos = SPOS_MAX; ++ iter->at_end = true; ++ } ++ ++ return ret; ++} ++ ++void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) ++{ ++ bch2_journal_iter_exit(&iter->journal); ++} ++ ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b, ++ struct btree_node_iter node_iter, ++ struct bpos pos) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->b = b; ++ iter->node_iter = node_iter; ++ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); ++ INIT_LIST_HEAD(&iter->journal.list); ++ iter->pos = b->data->min_key; ++ iter->at_end = false; ++} ++ ++/* ++ * this version is used by btree_gc before filesystem has gone RW and ++ * multithreaded, so uses the journal_iters list: ++ */ ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b) ++{ ++ struct btree_node_iter node_iter; ++ ++ bch2_btree_node_iter_init_from_start(&node_iter, b); ++ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); ++ list_add(&iter->journal.list, &c->journal_iters); ++} ++ ++/* sort and dedup all keys in the journal: */ ++ ++void bch2_journal_entries_free(struct bch_fs *c) ++{ ++ struct journal_replay **i; ++ struct genradix_iter iter; ++ ++ genradix_for_each(&c->journal_entries, iter, i) ++ if (*i) ++ kvpfree(*i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&(*i)->j)); ++ genradix_free(&c->journal_entries); ++} ++ ++/* ++ * When keys compare equal, oldest compares first: ++ */ ++static int journal_sort_key_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = _l; ++ const struct journal_key *r = _r; ++ ++ return journal_key_cmp(l, r) ?: ++ cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->journal_offset, r->journal_offset); ++} ++ ++void bch2_journal_keys_free(struct journal_keys *keys) ++{ ++ struct journal_key *i; ++ ++ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); ++ keys->gap = keys->nr; ++ ++ for (i = keys->d; i < keys->d + keys->nr; i++) ++ if (i->allocated) ++ kfree(i->k); ++ ++ kvfree(keys->d); ++ keys->d = NULL; ++ keys->nr = keys->gap = keys->size = 0; ++} ++ ++static void __journal_keys_sort(struct journal_keys *keys) ++{ ++ struct journal_key *src, *dst; ++ ++ sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); ++ ++ src = dst = keys->d; ++ while (src < keys->d + keys->nr) { ++ while (src + 1 < keys->d + keys->nr && ++ src[0].btree_id == src[1].btree_id && ++ src[0].level == src[1].level && ++ bpos_eq(src[0].k->k.p, src[1].k->k.p)) ++ src++; ++ ++ *dst++ = *src++; ++ } ++ ++ keys->nr = dst - keys->d; ++} ++ ++int bch2_journal_keys_sort(struct bch_fs *c) ++{ ++ struct genradix_iter iter; ++ struct journal_replay *i, **_i; ++ struct jset_entry *entry; ++ struct bkey_i *k; ++ struct journal_keys *keys = &c->journal_keys; ++ size_t nr_keys = 0, nr_read = 0; ++ ++ genradix_for_each(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ for_each_jset_key(k, entry, &i->j) ++ nr_keys++; ++ } ++ ++ if (!nr_keys) ++ return 0; ++ ++ keys->size = roundup_pow_of_two(nr_keys); ++ ++ keys->d = kvmalloc_array(keys->size, 
sizeof(keys->d[0]), GFP_KERNEL); ++ if (!keys->d) { ++ bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", ++ nr_keys); ++ ++ do { ++ keys->size >>= 1; ++ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); ++ } while (!keys->d && keys->size > nr_keys / 8); ++ ++ if (!keys->d) { ++ bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", ++ keys->size); ++ return -BCH_ERR_ENOMEM_journal_keys_sort; ++ } ++ } ++ ++ genradix_for_each(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ cond_resched(); ++ ++ for_each_jset_key(k, entry, &i->j) { ++ if (keys->nr == keys->size) { ++ __journal_keys_sort(keys); ++ ++ if (keys->nr > keys->size * 7 / 8) { ++ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", ++ keys->nr, keys->size, nr_read, nr_keys); ++ return -BCH_ERR_ENOMEM_journal_keys_sort; ++ } ++ } ++ ++ keys->d[keys->nr++] = (struct journal_key) { ++ .btree_id = entry->btree_id, ++ .level = entry->level, ++ .k = k, ++ .journal_seq = le64_to_cpu(i->j.seq), ++ .journal_offset = k->_data - i->j._data, ++ }; ++ ++ nr_read++; ++ } ++ } ++ ++ __journal_keys_sort(keys); ++ keys->gap = keys->nr; ++ ++ bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); ++ return 0; ++} +diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h +new file mode 100644 +index 000000000..5d64e7e22 +--- /dev/null ++++ b/fs/bcachefs/btree_journal_iter.h +@@ -0,0 +1,57 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H ++#define _BCACHEFS_BTREE_JOURNAL_ITER_H ++ ++struct journal_iter { ++ struct list_head list; ++ enum btree_id btree_id; ++ unsigned level; ++ size_t idx; ++ struct journal_keys *keys; ++}; ++ ++/* ++ * Iterate over keys in the btree, with keys from the journal overlaid on top: ++ */ ++ ++struct btree_and_journal_iter { ++ struct btree *b; ++ struct btree_node_iter node_iter; ++ struct bkey unpacked; ++ ++ struct journal_iter journal; ++ struct bpos pos; ++ bool at_end; ++}; ++ ++struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos, struct bpos, size_t *); ++struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); ++ ++int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, ++ unsigned, struct bkey_i *); ++int bch2_journal_key_insert(struct bch_fs *, enum btree_id, ++ unsigned, struct bkey_i *); ++int bch2_journal_key_delete(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); ++void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); ++ ++void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct bch_fs *, struct btree *, ++ struct btree_node_iter, struct bpos); ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct bch_fs *, ++ struct btree *); ++ ++void bch2_journal_keys_free(struct journal_keys *); ++void bch2_journal_entries_free(struct bch_fs *); ++ ++int bch2_journal_keys_sort(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c 
b/fs/bcachefs/btree_key_cache.c new file mode 100644 index 000000000..f7c001d42 @@ -27881,7 +28223,7 @@ index 000000000..be3acde2c +#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c new file mode 100644 -index 000000000..d7fd87149 +index 000000000..0b0f9d607 --- /dev/null +++ b/fs/bcachefs/btree_locking.c @@ -0,0 +1,797 @@ @@ -28274,7 +28616,7 @@ index 000000000..d7fd87149 + six_lock_readers_add(&b->lock, readers); + + if (ret) -+ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent); ++ mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED); + + return ret; +} @@ -28438,7 +28780,7 @@ index 000000000..d7fd87149 + trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); + return false; +success: -+ mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); ++ mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); + return true; +} + @@ -28553,7 +28895,7 @@ index 000000000..d7fd87149 + } else { + if (btree_node_intent_locked(path, l)) { + six_lock_downgrade(&path->l[l].b->c.lock); -+ mark_btree_node_locked_noreset(path, l, SIX_LOCK_read); ++ mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED); + } + break; + } @@ -28684,10 +29026,10 @@ index 000000000..d7fd87149 +#endif diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h new file mode 100644 -index 000000000..f3e58aa27 +index 000000000..22e2cd391 --- /dev/null +++ b/fs/bcachefs/btree_locking.h -@@ -0,0 +1,424 @@ +@@ -0,0 +1,423 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_LOCKING_H +#define _BCACHEFS_BTREE_LOCKING_H @@ -28700,9 +29042,8 @@ index 000000000..f3e58aa27 + * updating the iterator state + */ + -+#include -+ +#include "btree_iter.h" ++#include "six.h" + +void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); + @@ -28871,7 +29212,7 @@ index 000000000..f3e58aa27 + EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); + EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); + -+ mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); ++ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); + + trans_for_each_path_with_node(trans, b, linked) + linked->l[b->c.level].lock_seq++; @@ -28984,7 +29325,7 @@ index 000000000..f3e58aa27 + * write lock: thus, we need to tell the cycle detector we have a write + * lock _before_ taking the lock: + */ -+ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write); ++ mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED); + + return likely(six_trylock_write(&b->lock)) + ? 
0 @@ -29112,19 +29453,1180 @@ index 000000000..f3e58aa27 +#endif + +#endif /* _BCACHEFS_BTREE_LOCKING_H */ +diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c +new file mode 100644 +index 000000000..eafb0388e +--- /dev/null ++++ b/fs/bcachefs/btree_trans_commit.c +@@ -0,0 +1,1156 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_gc.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_journal_iter.h" ++#include "btree_key_cache.h" ++#include "btree_update_interior.h" ++#include "btree_write_buffer.h" ++#include "buckets.h" ++#include "errcode.h" ++#include "error.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "replicas.h" ++#include "snapshot.h" ++ ++#include ++ ++static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bch_fs *c = trans->c; ++ struct bkey u; ++ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); ++ ++ if (unlikely(trans->journal_replay_not_finished)) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); ++ ++ if (j_k) ++ k = bkey_i_to_s_c(j_k); ++ } ++ ++ u = *k.k; ++ u.needs_whiteout = i->old_k.needs_whiteout; ++ ++ BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); ++ BUG_ON(i->old_v != k.v); ++#endif ++} ++ ++static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) ++{ ++ return i->path->l + i->level; ++} ++ ++static inline bool same_leaf_as_prev(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ return i != trans->updates && ++ insert_l(&i[0])->b == insert_l(&i[-1])->b; ++} ++ ++static inline bool same_leaf_as_next(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ return i + 1 < trans->updates + trans->nr_updates && ++ insert_l(&i[0])->b == insert_l(&i[1])->b; ++} ++ ++inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) ++{ ++ struct bch_fs *c = trans->c; ++ ++ if (unlikely(btree_node_just_written(b)) && ++ bch2_btree_post_write_cleanup(c, b)) ++ bch2_trans_node_reinit_iter(trans, b); ++ ++ /* ++ * If the last bset has been written, or if it's gotten too big - start ++ * a new bset to insert into: ++ */ ++ if (want_new_bset(c, b)) ++ bch2_btree_init_next(trans, b); ++} ++ ++/* Inserting into a given leaf node (last stage of insert): */ ++ ++/* Handle overwrites and do insert, for non extents: */ ++bool bch2_btree_bset_insert_key(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_i *insert) ++{ ++ struct bkey_packed *k; ++ unsigned clobber_u64s = 0, new_u64s = 0; ++ ++ EBUG_ON(btree_node_just_written(b)); ++ EBUG_ON(bset_written(b, btree_bset_last(b))); ++ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); ++ EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); ++ EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); ++ EBUG_ON(insert->k.u64s > ++ bch_btree_keys_u64s_remaining(trans->c, b)); ++ EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) ++ k = NULL; ++ ++ /* @k is the key being overwritten/deleted, if any: */ ++ EBUG_ON(k && bkey_deleted(k)); ++ ++ /* Deleting, but not found? 
nothing to do: */ ++ if (bkey_deleted(&insert->k) && !k) ++ return false; ++ ++ if (bkey_deleted(&insert->k)) { ++ /* Deleting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ if (k->needs_whiteout) ++ push_whiteout(trans->c, b, insert->k.p); ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ bch2_bset_delete(b, k, clobber_u64s); ++ goto fix_iter; ++ } else { ++ bch2_btree_path_fix_key_modified(trans, b, k); ++ } ++ ++ return true; ++ } ++ ++ if (k) { ++ /* Overwriting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ insert->k.needs_whiteout = k->needs_whiteout; ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ goto overwrite; ++ } else { ++ bch2_btree_path_fix_key_modified(trans, b, k); ++ } ++ } ++ ++ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); ++overwrite: ++ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); ++ new_u64s = k->u64s; ++fix_iter: ++ if (clobber_u64s != new_u64s) ++ bch2_btree_node_iter_fix(trans, path, b, node_iter, k, ++ clobber_u64s, new_u64s); ++ return true; ++} ++ ++static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, ++ unsigned i, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct btree_write *w = container_of(pin, struct btree_write, journal); ++ struct btree *b = container_of(w, struct btree, writes[i]); ++ struct btree_trans trans; ++ unsigned long old, new, v; ++ unsigned idx = w - b->writes; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); ++ v = READ_ONCE(b->flags); ++ ++ do { ++ old = new = v; ++ ++ if (!(old & (1 << BTREE_NODE_dirty)) || ++ !!(old & (1 << BTREE_NODE_write_idx)) != idx || ++ w->journal.seq != seq) ++ break; ++ ++ new &= ~BTREE_WRITE_TYPE_MASK; ++ new |= BTREE_WRITE_journal_reclaim; ++ new |= 1 << BTREE_NODE_need_write; ++ } while ((v = cmpxchg(&b->flags, old, new)) != old); ++ ++ btree_node_write_if_need(c, b, SIX_LOCK_read); ++ six_unlock_read(&b->c.lock); ++ ++ bch2_trans_exit(&trans); ++ return 0; ++} ++ ++int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 0, seq); ++} ++ ++int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 1, seq); ++} ++ ++inline void bch2_btree_add_journal_pin(struct bch_fs *c, ++ struct btree *b, u64 seq) ++{ ++ struct btree_write *w = btree_current_write(b); ++ ++ bch2_journal_pin_add(&c->journal, seq, &w->journal, ++ btree_node_write_idx(b) == 0 ++ ? 
bch2_btree_node_flush0 ++ : bch2_btree_node_flush1); ++} ++ ++/** ++ * btree_insert_key - insert a key one key into a leaf node ++ */ ++inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, ++ struct btree_path *path, ++ struct bkey_i *insert, ++ u64 journal_seq) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = path_l(path)->b; ++ struct bset_tree *t = bset_tree_last(b); ++ struct bset *i = bset(b, t); ++ int old_u64s = bset_u64s(t); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, ++ &path_l(path)->iter, insert))) ++ return; ++ ++ i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, journal_seq); ++ ++ if (unlikely(!btree_node_dirty(b))) { ++ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); ++ set_btree_node_dirty_acct(c, b); ++ } ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) bset_u64s(t) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_trans_node_reinit_iter(trans, b); ++} ++ ++/* Cached btree updates: */ ++ ++/* Normal update interface: */ ++ ++static inline void btree_insert_entry_checks(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); ++ BUG_ON(i->cached != i->path->cached); ++ BUG_ON(i->level != i->path->level); ++ BUG_ON(i->btree_id != i->path->btree_id); ++ EBUG_ON(!i->level && ++ !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && ++ test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && ++ i->k->k.p.snapshot && ++ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); ++} ++ ++static noinline int ++bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, ++ unsigned long trace_ip) ++{ ++ return drop_locks_do(trans, ++ bch2_journal_preres_get(&trans->c->journal, ++ &trans->journal_preres, ++ trans->journal_preres_u64s, ++ (flags & BCH_WATERMARK_MASK))); ++} ++ ++static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, ++ unsigned flags) ++{ ++ return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, ++ trans->journal_u64s, flags); ++} ++ ++#define JSET_ENTRY_LOG_U64s 4 ++ ++static noinline void journal_transaction_name(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ struct jset_entry *entry = ++ bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_log, 0, 0, ++ JSET_ENTRY_LOG_U64s); ++ struct jset_entry_log *l = ++ container_of(entry, struct jset_entry_log, entry); ++ ++ strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); ++} ++ ++static inline int btree_key_can_insert(struct btree_trans *trans, ++ struct btree *b, unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ ++ if (!bch2_btree_node_insert_fits(c, b, u64s)) ++ return -BCH_ERR_btree_insert_btree_node_full; ++ ++ return 0; ++} ++ ++static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, ++ struct btree_path *path, unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck = (void *) path->l[0].b; ++ struct btree_insert_entry *i; ++ unsigned new_u64s; ++ struct bkey_i 
*new_k; ++ ++ EBUG_ON(path->level); ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && ++ bch2_btree_key_cache_must_wait(c) && ++ !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) ++ return -BCH_ERR_btree_insert_need_journal_reclaim; ++ ++ /* ++ * bch2_varint_decode can read past the end of the buffer by at most 7 ++ * bytes (it won't be used): ++ */ ++ u64s += 1; ++ ++ if (u64s <= ck->u64s) ++ return 0; ++ ++ new_u64s = roundup_pow_of_two(u64s); ++ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) { ++ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", ++ bch2_btree_ids[path->btree_id], new_u64s); ++ return -BCH_ERR_ENOMEM_btree_key_cache_insert; ++ } ++ ++ trans_for_each_update(trans, i) ++ if (i->old_v == &ck->k->v) ++ i->old_v = &new_k->v; ++ ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ return 0; ++} ++ ++/* Triggers: */ ++ ++static int run_one_mem_trigger(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ unsigned flags) ++{ ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ struct bkey_i *new = i->k; ++ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); ++ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); ++ int ret; ++ ++ verify_update_old_key(trans, i); ++ ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ ++ if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id)) ++ return 0; ++ ++ if (old_ops->atomic_trigger == new_ops->atomic_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ ret = bch2_mark_key(trans, i->btree_id, i->level, ++ old, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); ++ } else { ++ struct bkey _deleted = KEY(0, 0, 0); ++ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; ++ ++ _deleted.p = i->path->pos; ++ ++ ret = bch2_mark_key(trans, i->btree_id, i->level, ++ deleted, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|flags) ?: ++ bch2_mark_key(trans, i->btree_id, i->level, ++ old, deleted, ++ BTREE_TRIGGER_OVERWRITE|flags); ++ } ++ ++ return ret; ++} ++ ++static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, ++ bool overwrite) ++{ ++ /* ++ * Transactional triggers create new btree_insert_entries, so we can't ++ * pass them a pointer to a btree_insert_entry, that memory is going to ++ * move: ++ */ ++ struct bkey old_k = i->old_k; ++ struct bkey_s_c old = { &old_k, i->old_v }; ++ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); ++ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); ++ ++ verify_update_old_key(trans, i); ++ ++ if ((i->flags & BTREE_TRIGGER_NORUN) || ++ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) ++ return 0; ++ ++ if (!i->insert_trigger_run && ++ !i->overwrite_trigger_run && ++ old_ops->trans_trigger == new_ops->trans_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ i->overwrite_trigger_run = true; ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, ++ BTREE_TRIGGER_INSERT| ++ BTREE_TRIGGER_OVERWRITE| ++ i->flags) ?: 1; ++ } else if (overwrite && !i->overwrite_trigger_run) { ++ i->overwrite_trigger_run = true; ++ return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; ++ } else if (!overwrite && !i->insert_trigger_run) { ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; ++ } else { ++ return 0; ++ } ++} ++ ++static int 
run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, ++ struct btree_insert_entry *btree_id_start) ++{ ++ struct btree_insert_entry *i; ++ bool trans_trigger_run; ++ int ret, overwrite; ++ ++ for (overwrite = 1; overwrite >= 0; --overwrite) { ++ ++ /* ++ * Running triggers will append more updates to the list of updates as ++ * we're walking it: ++ */ ++ do { ++ trans_trigger_run = false; ++ ++ for (i = btree_id_start; ++ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; ++ i++) { ++ if (i->btree_id != btree_id) ++ continue; ++ ++ ret = run_one_trans_trigger(trans, i, overwrite); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ trans_trigger_run = true; ++ } ++ } while (trans_trigger_run); ++ } ++ ++ return 0; ++} ++ ++static int bch2_trans_commit_run_triggers(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; ++ unsigned btree_id = 0; ++ int ret = 0; ++ ++ /* ++ * ++ * For a given btree, this algorithm runs insert triggers before ++ * overwrite triggers: this is so that when extents are being moved ++ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before ++ * they are re-added. ++ */ ++ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { ++ if (btree_id == BTREE_ID_alloc) ++ continue; ++ ++ while (btree_id_start < trans->updates + trans->nr_updates && ++ btree_id_start->btree_id < btree_id) ++ btree_id_start++; ++ ++ ret = run_btree_triggers(trans, btree_id, btree_id_start); ++ if (ret) ++ return ret; ++ } ++ ++ trans_for_each_update(trans, i) { ++ if (i->btree_id > BTREE_ID_alloc) ++ break; ++ if (i->btree_id == BTREE_ID_alloc) { ++ ret = run_btree_triggers(trans, BTREE_ID_alloc, i); ++ if (ret) ++ return ret; ++ break; ++ } ++ } ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans_for_each_update(trans, i) ++ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && ++ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && ++ (!i->insert_trigger_run || !i->overwrite_trigger_run)); ++#endif ++ return 0; ++} ++ ++static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ int ret = 0; ++ ++ trans_for_each_update(trans, i) { ++ /* ++ * XXX: synchronization of cached update triggers with gc ++ * XXX: synchronization of interior node updates with gc ++ */ ++ BUG_ON(i->cached || i->level); ++ ++ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { ++ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); ++ if (ret) ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static inline int ++bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, ++ struct btree_insert_entry **stopped_at, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ struct btree_write_buffered_key *wb; ++ struct btree_trans_commit_hook *h; ++ unsigned u64s = 0; ++ bool marking = false; ++ int ret; ++ ++ if (race_fault()) { ++ trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); ++ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); ++ } ++ ++ /* ++ * Check if the insert will fit in the leaf node with the write lock ++ * held, otherwise another thread could write the node changing the ++ * amount of space available: ++ */ ++ ++ prefetch(&trans->c->journal.flags); ++ ++ trans_for_each_update(trans, i) { ++ /* Multiple inserts might go to same leaf: */ ++ if (!same_leaf_as_prev(trans, i)) ++ u64s = 0; ++ ++ u64s += i->k->k.u64s; 
++ ret = !i->cached ++ ? btree_key_can_insert(trans, insert_l(i)->b, u64s) ++ : btree_key_can_insert_cached(trans, flags, i->path, u64s); ++ if (ret) { ++ *stopped_at = i; ++ return ret; ++ } ++ ++ if (btree_node_type_needs_gc(i->bkey_type)) ++ marking = true; ++ } ++ ++ if (trans->nr_wb_updates && ++ trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) ++ return -BCH_ERR_btree_insert_need_flush_buffer; ++ ++ /* ++ * Don't get journal reservation until after we know insert will ++ * succeed: ++ */ ++ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ ret = bch2_trans_journal_res_get(trans, ++ (flags & BCH_WATERMARK_MASK)| ++ JOURNAL_RES_GET_NONBLOCK); ++ if (ret) ++ return ret; ++ ++ if (unlikely(trans->journal_transaction_names)) ++ journal_transaction_name(trans); ++ } else { ++ trans->journal_res.seq = c->journal.replay_journal_seq; ++ } ++ ++ /* ++ * Not allowed to fail after we've gotten our journal reservation - we ++ * have to use it: ++ */ ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && ++ !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { ++ if (bch2_journal_seq_verify) ++ trans_for_each_update(trans, i) ++ i->k->k.version.lo = trans->journal_res.seq; ++ else if (bch2_inject_invalid_keys) ++ trans_for_each_update(trans, i) ++ i->k->k.version = MAX_VERSION; ++ } ++ ++ if (trans->fs_usage_deltas && ++ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) ++ return -BCH_ERR_btree_insert_need_mark_replicas; ++ ++ if (trans->nr_wb_updates) { ++ EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); ++ ++ ret = bch2_btree_insert_keys_write_buffer(trans); ++ if (ret) ++ goto revert_fs_usage; ++ } ++ ++ h = trans->hooks; ++ while (h) { ++ ret = h->fn(trans, h); ++ if (ret) ++ goto revert_fs_usage; ++ h = h->next; ++ } ++ ++ trans_for_each_update(trans, i) ++ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { ++ ret = run_one_mem_trigger(trans, i, i->flags); ++ if (ret) ++ goto fatal_err; ++ } ++ ++ if (unlikely(c->gc_pos.phase)) { ++ ret = bch2_trans_commit_run_gc_triggers(trans); ++ if (ret) ++ goto fatal_err; ++ } ++ ++ if (unlikely(trans->extra_journal_entries.nr)) { ++ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), ++ trans->extra_journal_entries.data, ++ trans->extra_journal_entries.nr); ++ ++ trans->journal_res.offset += trans->extra_journal_entries.nr; ++ trans->journal_res.u64s -= trans->extra_journal_entries.nr; ++ } ++ ++ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ struct journal *j = &c->journal; ++ struct jset_entry *entry; ++ ++ trans_for_each_update(trans, i) { ++ if (i->key_cache_already_flushed) ++ continue; ++ ++ if (i->flags & BTREE_UPDATE_NOJOURNAL) ++ continue; ++ ++ verify_update_old_key(trans, i); ++ ++ if (trans->journal_transaction_names) { ++ entry = bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_overwrite, ++ i->btree_id, i->level, ++ i->old_k.u64s); ++ bkey_reassemble(&entry->start[0], ++ (struct bkey_s_c) { &i->old_k, i->old_v }); ++ } ++ ++ entry = bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_btree_keys, ++ i->btree_id, i->level, ++ i->k->k.u64s); ++ bkey_copy(&entry->start[0], i->k); ++ } ++ ++ trans_for_each_wb_update(trans, wb) { ++ entry = bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_btree_keys, ++ wb->btree, 0, ++ wb->k.k.u64s); ++ bkey_copy(&entry->start[0], &wb->k); ++ } ++ ++ if (trans->journal_seq) ++ *trans->journal_seq = trans->journal_res.seq; ++ } ++ ++ trans_for_each_update(trans, i) { ++ i->k->k.needs_whiteout = false; 
++ ++ if (!i->cached) { ++ u64 seq = trans->journal_res.seq; ++ ++ if (i->flags & BTREE_UPDATE_PREJOURNAL) ++ seq = i->seq; ++ ++ bch2_btree_insert_key_leaf(trans, i->path, i->k, seq); ++ } else if (!i->key_cache_already_flushed) ++ bch2_btree_insert_key_cached(trans, flags, i); ++ else { ++ bch2_btree_key_cache_drop(trans, i->path); ++ btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); ++ } ++ } ++ ++ return 0; ++fatal_err: ++ bch2_fatal_error(c); ++revert_fs_usage: ++ if (trans->fs_usage_deltas) ++ bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); ++ return ret; ++} ++ ++static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) ++{ ++ while (--i >= trans->updates) { ++ if (same_leaf_as_prev(trans, i)) ++ continue; ++ ++ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); ++ } ++ ++ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); ++} ++ ++static inline int trans_lock_write(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) { ++ if (same_leaf_as_prev(trans, i)) ++ continue; ++ ++ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) ++ return trans_lock_write_fail(trans, i); ++ ++ if (!i->cached) ++ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); ++ } ++ ++ return 0; ++} ++ ++static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ struct btree_write_buffered_key *wb; ++ ++ trans_for_each_update(trans, i) ++ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); ++ ++ trans_for_each_wb_update(trans, wb) ++ bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, ++ struct btree_insert_entry *i, ++ struct printbuf *err) ++{ ++ struct bch_fs *c = trans->c; ++ int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; ++ ++ printbuf_reset(err); ++ prt_printf(err, "invalid bkey on insert from %s -> %ps", ++ trans->fn, (void *) i->ip_allocated); ++ prt_newline(err); ++ printbuf_indent_add(err, 2); ++ ++ bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); ++ prt_newline(err); ++ ++ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, rw, err); ++ bch2_print_string_as_lines(KERN_ERR, err->buf); ++ ++ bch2_inconsistent_error(c); ++ bch2_dump_trans_updates(trans); ++ ++ return -EINVAL; ++} ++#endif ++ ++/* ++ * Get journal reservation, take write locks, and attempt to do btree update(s): ++ */ ++static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, ++ struct btree_insert_entry **stopped_at, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ int ret = 0, u64s_delta = 0; ++ ++ trans_for_each_update(trans, i) { ++ if (i->cached) ++ continue; ++ ++ u64s_delta += !bkey_deleted(&i->k->k) ? 
i->k->k.u64s : 0; ++ u64s_delta -= i->old_btree_u64s; ++ ++ if (!same_leaf_as_next(trans, i)) { ++ if (u64s_delta <= 0) { ++ ret = bch2_foreground_maybe_merge(trans, i->path, ++ i->level, flags); ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ u64s_delta = 0; ++ } ++ } ++ ++ ret = bch2_journal_preres_get(&c->journal, ++ &trans->journal_preres, trans->journal_preres_u64s, ++ (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); ++ if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) ++ ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); ++ if (unlikely(ret)) ++ return ret; ++ ++ ret = trans_lock_write(trans); ++ if (unlikely(ret)) ++ return ret; ++ ++ ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); ++ ++ if (!ret && unlikely(trans->journal_replay_not_finished)) ++ bch2_drop_overwrites_from_journal(trans); ++ ++ trans_for_each_update(trans, i) ++ if (!same_leaf_as_prev(trans, i)) ++ bch2_btree_node_unlock_write_inlined(trans, i->path, ++ insert_l(i)->b); ++ ++ if (!ret && trans->journal_pin) ++ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, ++ trans->journal_pin, NULL); ++ ++ /* ++ * Drop journal reservation after dropping write locks, since dropping ++ * the journal reservation may kick off a journal write: ++ */ ++ bch2_journal_res_put(&c->journal, &trans->journal_res); ++ ++ if (unlikely(ret)) ++ return ret; ++ ++ bch2_trans_downgrade(trans); ++ ++ return 0; ++} ++ ++static int journal_reclaim_wait_done(struct bch_fs *c) ++{ ++ int ret = bch2_journal_error(&c->journal) ?: ++ !bch2_btree_key_cache_must_wait(c); ++ ++ if (!ret) ++ journal_reclaim_kick(&c->journal); ++ return ret; ++} ++ ++static noinline ++int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, ++ struct btree_insert_entry *i, ++ int ret, unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ ++ switch (ret) { ++ case -BCH_ERR_btree_insert_btree_node_full: ++ ret = bch2_btree_split_leaf(trans, i->path, flags); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); ++ break; ++ case -BCH_ERR_btree_insert_need_mark_replicas: ++ ret = drop_locks_do(trans, ++ bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); ++ break; ++ case -BCH_ERR_journal_res_get_blocked: ++ /* ++ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK ++ * flag ++ */ ++ if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && ++ (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { ++ ret = -BCH_ERR_journal_reclaim_would_deadlock; ++ break; ++ } ++ ++ ret = drop_locks_do(trans, ++ bch2_trans_journal_res_get(trans, ++ (flags & BCH_WATERMARK_MASK)| ++ JOURNAL_RES_GET_CHECK)); ++ break; ++ case -BCH_ERR_btree_insert_need_journal_reclaim: ++ bch2_trans_unlock(trans); ++ ++ trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); ++ ++ wait_event_freezable(c->journal.reclaim_wait, ++ (ret = journal_reclaim_wait_done(c))); ++ if (ret < 0) ++ break; ++ ++ ret = bch2_trans_relock(trans); ++ break; ++ case -BCH_ERR_btree_insert_need_flush_buffer: { ++ struct btree_write_buffer *wb = &c->btree_write_buffer; ++ ++ ret = 0; ++ ++ if (wb->state.nr > wb->size * 3 / 4) { ++ bch2_trans_unlock(trans); ++ mutex_lock(&wb->flush_lock); ++ ++ if (wb->state.nr > wb->size * 3 / 4) { ++ bch2_trans_begin(trans); ++ ret = __bch2_btree_write_buffer_flush(trans, ++ flags|BTREE_INSERT_NOCHECK_RW, true); ++ if (!ret) { ++ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); ++ ret = 
btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); ++ } ++ } else { ++ mutex_unlock(&wb->flush_lock); ++ ret = bch2_trans_relock(trans); ++ } ++ } ++ break; ++ } ++ default: ++ BUG_ON(ret >= 0); ++ break; ++ } ++ ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); ++ ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && ++ !(flags & BTREE_INSERT_NOWAIT) && ++ (flags & BTREE_INSERT_NOFAIL), c, ++ "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); ++ ++ return ret; ++} ++ ++static noinline int ++bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || ++ test_bit(BCH_FS_STARTED, &c->flags)) ++ return -BCH_ERR_erofs_trans_commit; ++ ++ ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); ++ if (ret) ++ return ret; ++ ++ bch2_write_ref_get(c, BCH_WRITE_REF_trans); ++ return 0; ++} ++ ++/* ++ * This is for updates done in the early part of fsck - btree_gc - before we've ++ * gone RW. we only add the new key to the list of keys for journal replay to ++ * do. ++ */ ++static noinline int ++do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ int ret = 0; ++ ++ trans_for_each_update(trans, i) { ++ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i = NULL; ++ struct btree_write_buffered_key *wb; ++ unsigned u64s; ++ int ret = 0; ++ ++ if (!trans->nr_updates && ++ !trans->nr_wb_updates && ++ !trans->extra_journal_entries.nr) ++ goto out_reset; ++ ++ if (flags & BTREE_INSERT_GC_LOCK_HELD) ++ lockdep_assert_held(&c->gc_lock); ++ ++ ret = bch2_trans_commit_run_triggers(trans); ++ if (ret) ++ goto out_reset; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans_for_each_update(trans, i) { ++ struct printbuf buf = PRINTBUF; ++ enum bkey_invalid_flags invalid_flags = 0; ++ ++ if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) ++ invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; ++ ++ if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, invalid_flags, &buf))) ++ ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); ++ btree_insert_entry_checks(trans, i); ++ printbuf_exit(&buf); ++ ++ if (ret) ++ return ret; ++ } ++#endif ++ ++ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { ++ ret = do_bch2_trans_commit_to_journal_replay(trans); ++ goto out_reset; ++ } ++ ++ if (!(flags & BTREE_INSERT_NOCHECK_RW) && ++ unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { ++ ret = bch2_trans_commit_get_rw_cold(trans, flags); ++ if (ret) ++ goto out_reset; ++ } ++ ++ if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && ++ mutex_trylock(&c->btree_write_buffer.flush_lock)) { ++ bch2_trans_begin(trans); ++ bch2_trans_unlock(trans); ++ ++ ret = __bch2_btree_write_buffer_flush(trans, ++ flags|BTREE_INSERT_NOCHECK_RW, true); ++ if (!ret) { ++ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); ++ } ++ goto out; ++ } ++ ++ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); ++ ++ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); ++ ++ trans->journal_u64s = trans->extra_journal_entries.nr; ++ 
trans->journal_preres_u64s = 0; ++ ++ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); ++ ++ if (trans->journal_transaction_names) ++ trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); ++ ++ trans_for_each_update(trans, i) { ++ EBUG_ON(!i->path->should_be_locked); ++ ++ ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); ++ if (unlikely(ret)) ++ goto out; ++ ++ EBUG_ON(!btree_node_intent_locked(i->path, i->level)); ++ ++ if (i->key_cache_already_flushed) ++ continue; ++ ++ /* we're going to journal the key being updated: */ ++ u64s = jset_u64s(i->k->k.u64s); ++ if (i->cached && ++ likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) ++ trans->journal_preres_u64s += u64s; ++ ++ if (i->flags & BTREE_UPDATE_NOJOURNAL) ++ continue; ++ ++ trans->journal_u64s += u64s; ++ ++ /* and we're also going to log the overwrite: */ ++ if (trans->journal_transaction_names) ++ trans->journal_u64s += jset_u64s(i->old_k.u64s); ++ } ++ ++ trans_for_each_wb_update(trans, wb) ++ trans->journal_u64s += jset_u64s(wb->k.k.u64s); ++ ++ if (trans->extra_journal_res) { ++ ret = bch2_disk_reservation_add(c, trans->disk_res, ++ trans->extra_journal_res, ++ (flags & BTREE_INSERT_NOFAIL) ++ ? BCH_DISK_RESERVATION_NOFAIL : 0); ++ if (ret) ++ goto err; ++ } ++retry: ++ bch2_trans_verify_not_in_restart(trans); ++ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); ++ ++ ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); ++ ++ /* make sure we didn't drop or screw up locks: */ ++ bch2_trans_verify_locks(trans); ++ ++ if (ret) ++ goto err; ++ ++ trace_and_count(c, transaction_commit, trans, _RET_IP_); ++out: ++ bch2_journal_preres_put(&c->journal, &trans->journal_preres); ++ ++ if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) ++ bch2_write_ref_put(c, BCH_WRITE_REF_trans); ++out_reset: ++ bch2_trans_reset_updates(trans); ++ ++ return ret; ++err: ++ ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); ++ if (ret) ++ goto out; ++ ++ goto retry; ++} diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h new file mode 100644 -index 000000000..d95360160 +index 000000000..71ad3893e --- /dev/null +++ b/fs/bcachefs/btree_types.h -@@ -0,0 +1,743 @@ +@@ -0,0 +1,746 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_TYPES_H +#define _BCACHEFS_BTREE_TYPES_H + +#include +#include -+#include + +//#include "bkey_methods.h" +#include "buckets_types.h" @@ -29132,6 +30634,7 @@ index 000000000..d95360160 +#include "errcode.h" +#include "journal_types.h" +#include "replicas_types.h" ++#include "six.h" + +struct open_bucket; +struct btree_update; @@ -29763,7 +31266,7 @@ index 000000000..d95360160 +} + +enum btree_node_type { -+#define x(kwd, val) BKEY_TYPE_##kwd = val, ++#define x(kwd, val, ...) 
BKEY_TYPE_##kwd = val, + BCH_BTREE_IDS() +#undef x + BKEY_TYPE_btree, @@ -29782,31 +31285,37 @@ index 000000000..d95360160 +} + +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ -+ ((1U << BKEY_TYPE_extents)| \ -+ (1U << BKEY_TYPE_alloc)| \ -+ (1U << BKEY_TYPE_inodes)| \ -+ (1U << BKEY_TYPE_stripes)| \ -+ (1U << BKEY_TYPE_reflink)| \ -+ (1U << BKEY_TYPE_btree)) ++ (BIT(BKEY_TYPE_extents)| \ ++ BIT(BKEY_TYPE_alloc)| \ ++ BIT(BKEY_TYPE_inodes)| \ ++ BIT(BKEY_TYPE_stripes)| \ ++ BIT(BKEY_TYPE_reflink)| \ ++ BIT(BKEY_TYPE_btree)) + +#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ -+ ((1U << BKEY_TYPE_alloc)| \ -+ (1U << BKEY_TYPE_inodes)| \ -+ (1U << BKEY_TYPE_stripes)| \ -+ (1U << BKEY_TYPE_snapshots)) ++ (BIT(BKEY_TYPE_alloc)| \ ++ BIT(BKEY_TYPE_inodes)| \ ++ BIT(BKEY_TYPE_stripes)| \ ++ BIT(BKEY_TYPE_snapshots)) + +#define BTREE_NODE_TYPE_HAS_TRIGGERS \ + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ + BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + -+#define BTREE_ID_IS_EXTENTS \ -+ ((1U << BTREE_ID_extents)| \ -+ (1U << BTREE_ID_reflink)| \ -+ (1U << BTREE_ID_freespace)) ++static inline bool btree_node_type_needs_gc(enum btree_node_type type) ++{ ++ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); ++} + +static inline bool btree_node_type_is_extents(enum btree_node_type type) +{ -+ return (1U << type) & BTREE_ID_IS_EXTENTS; ++ const unsigned mask = 0 ++#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << nr) ++ BCH_BTREE_IDS() ++#undef x ++ ; ++ ++ return (1U << type) & mask; +} + +static inline bool btree_id_is_extents(enum btree_id btree) @@ -29814,29 +31323,26 @@ index 000000000..d95360160 + return btree_node_type_is_extents((enum btree_node_type) btree); +} + -+#define BTREE_ID_HAS_SNAPSHOTS \ -+ ((1U << BTREE_ID_extents)| \ -+ (1U << BTREE_ID_inodes)| \ -+ (1U << BTREE_ID_dirents)| \ -+ (1U << BTREE_ID_xattrs)) -+ -+#define BTREE_ID_HAS_PTRS \ -+ ((1U << BTREE_ID_extents)| \ -+ (1U << BTREE_ID_reflink)) -+ +static inline bool btree_type_has_snapshots(enum btree_id id) +{ -+ return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; ++ const unsigned mask = 0 ++#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) ++ BCH_BTREE_IDS() ++#undef x ++ ; ++ ++ return (1U << id) & mask; +} + +static inline bool btree_type_has_ptrs(enum btree_id id) +{ -+ return (1 << id) & BTREE_ID_HAS_PTRS; -+} ++ const unsigned mask = 0 ++#define x(name, nr, flags, ...) 
|((!!((flags) & BTREE_ID_DATA)) << nr) ++ BCH_BTREE_IDS() ++#undef x ++ ; + -+static inline bool btree_node_type_needs_gc(enum btree_node_type type) -+{ -+ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); ++ return (1U << id) & mask; +} + +struct btree_root { @@ -29861,12 +31367,916 @@ index 000000000..d95360160 +}; + +#endif /* _BCACHEFS_BTREE_TYPES_H */ +diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c +new file mode 100644 +index 000000000..880ce7431 +--- /dev/null ++++ b/fs/bcachefs/btree_update.c +@@ -0,0 +1,898 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "btree_iter.h" ++#include "btree_journal_iter.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "debug.h" ++#include "errcode.h" ++#include "error.h" ++#include "extents.h" ++#include "keylist.h" ++#include "snapshot.h" ++#include "trace.h" ++ ++static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, ++ const struct btree_insert_entry *r) ++{ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ cmp_int(l->cached, r->cached) ?: ++ -cmp_int(l->level, r->level) ?: ++ bpos_cmp(l->k->k.p, r->k->k.p); ++} ++ ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, ++ struct bkey_i *, enum btree_update_flags, ++ unsigned long ip); ++ ++static noinline int extent_front_merge(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct bkey_i **insert, ++ enum btree_update_flags flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_i *update; ++ int ret; ++ ++ update = bch2_bkey_make_mut_noupdate(trans, k); ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ return ret; ++ ++ if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) ++ return 0; ++ ++ ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?: ++ bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ return 0; ++ ++ ret = bch2_btree_delete_at(trans, iter, flags); ++ if (ret) ++ return ret; ++ ++ *insert = update; ++ return 0; ++} ++ ++static noinline int extent_back_merge(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: ++ bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ return 0; ++ ++ bch2_bkey_merge(c, bkey_i_to_s(insert), k); ++ return 0; ++} ++ ++/* ++ * When deleting, check if we need to emit a whiteout (because we're overwriting ++ * something in an ancestor snapshot) ++ */ ++static int need_whiteout_for_snapshot(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u32 snapshot = pos.snapshot; ++ int ret; ++ ++ if (!bch2_snapshot_parent(trans->c, pos.snapshot)) ++ return 0; ++ ++ pos.snapshot++; ++ ++ for_each_btree_key_norestart(trans, iter, btree_id, pos, ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_NOPRESERVE, k, ret) { ++ if (!bkey_eq(k.k->p, pos)) ++ break; ++ ++ if (bch2_snapshot_is_ancestor(trans->c, snapshot, ++ k.k->p.snapshot)) { ++ ret = !bkey_whiteout(k.k); ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, ++ enum btree_id id, ++ struct bpos old_pos, ++ struct bpos new_pos) 
++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter old_iter, new_iter = { NULL }; ++ struct bkey_s_c old_k, new_k; ++ snapshot_id_list s; ++ struct bkey_i *update; ++ int ret; ++ ++ if (!bch2_snapshot_has_children(c, old_pos.snapshot)) ++ return 0; ++ ++ darray_init(&s); ++ ++ bch2_trans_iter_init(trans, &old_iter, id, old_pos, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ while ((old_k = bch2_btree_iter_prev(&old_iter)).k && ++ !(ret = bkey_err(old_k)) && ++ bkey_eq(old_pos, old_k.k->p)) { ++ struct bpos whiteout_pos = ++ SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; ++ ++ if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || ++ snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) ++ continue; ++ ++ new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ ret = bkey_err(new_k); ++ if (ret) ++ break; ++ ++ if (new_k.k->type == KEY_TYPE_deleted) { ++ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ break; ++ ++ bkey_init(&update->k); ++ update->k.p = whiteout_pos; ++ update->k.type = KEY_TYPE_whiteout; ++ ++ ret = bch2_trans_update(trans, &new_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ } ++ bch2_trans_iter_exit(trans, &new_iter); ++ ++ ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &new_iter); ++ bch2_trans_iter_exit(trans, &old_iter); ++ darray_exit(&s); ++ ++ return ret; ++} ++ ++int bch2_trans_update_extent_overwrite(struct btree_trans *trans, ++ struct btree_iter *iter, ++ enum btree_update_flags flags, ++ struct bkey_s_c old, ++ struct bkey_s_c new) ++{ ++ enum btree_id btree_id = iter->btree_id; ++ struct bkey_i *update; ++ struct bpos new_start = bkey_start_pos(new.k); ++ bool front_split = bkey_lt(bkey_start_pos(old.k), new_start); ++ bool back_split = bkey_gt(old.k->p, new.k->p); ++ int ret = 0, compressed_sectors; ++ ++ /* ++ * If we're going to be splitting a compressed extent, note it ++ * so that __bch2_trans_commit() can increase our disk ++ * reservation: ++ */ ++ if (((front_split && back_split) || ++ ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) && ++ (compressed_sectors = bch2_bkey_sectors_compressed(old))) ++ trans->extra_journal_res += compressed_sectors; ++ ++ if (front_split) { ++ update = bch2_bkey_make_mut_noupdate(trans, old); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ return ret; ++ ++ bch2_cut_back(new_start, update); ++ ++ ret = bch2_insert_snapshot_whiteouts(trans, btree_id, ++ old.k->p, update->k.p) ?: ++ bch2_btree_insert_nonextent(trans, btree_id, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); ++ if (ret) ++ return ret; ++ } ++ ++ /* If we're overwriting in a different snapshot - middle split: */ ++ if (old.k->p.snapshot != new.k->p.snapshot && ++ (front_split || back_split)) { ++ update = bch2_bkey_make_mut_noupdate(trans, old); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ return ret; ++ ++ bch2_cut_front(new_start, update); ++ bch2_cut_back(new.k->p, update); ++ ++ ret = bch2_insert_snapshot_whiteouts(trans, btree_id, ++ old.k->p, update->k.p) ?: ++ bch2_btree_insert_nonextent(trans, btree_id, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); ++ if (ret) ++ return ret; ++ } ++ ++ if (bkey_le(old.k->p, new.k->p)) { ++ update = bch2_trans_kmalloc(trans, sizeof(*update)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ return ret; ++ ++ bkey_init(&update->k); ++ update->k.p = 
old.k->p; ++ update->k.p.snapshot = new.k->p.snapshot; ++ ++ if (new.k->p.snapshot != old.k->p.snapshot) { ++ update->k.type = KEY_TYPE_whiteout; ++ } else if (btree_type_has_snapshots(btree_id)) { ++ ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ update->k.type = KEY_TYPE_whiteout; ++ } ++ ++ ret = bch2_btree_insert_nonextent(trans, btree_id, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); ++ if (ret) ++ return ret; ++ } ++ ++ if (back_split) { ++ update = bch2_bkey_make_mut_noupdate(trans, old); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ return ret; ++ ++ bch2_cut_front(new.k->p, update); ++ ++ ret = bch2_trans_update_by_path(trans, iter->path, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags, _RET_IP_); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int bch2_trans_update_extent(struct btree_trans *trans, ++ struct btree_iter *orig_iter, ++ struct bkey_i *insert, ++ enum btree_update_flags flags) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ enum btree_id btree_id = orig_iter->btree_id; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_NOT_EXTENTS); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); ++ if ((ret = bkey_err(k))) ++ goto err; ++ if (!k.k) ++ goto out; ++ ++ if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { ++ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { ++ ret = extent_front_merge(trans, &iter, k, &insert, flags); ++ if (ret) ++ goto err; ++ } ++ ++ goto next; ++ } ++ ++ while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { ++ bool done = bkey_lt(insert->k.p, k.k->p); ++ ++ ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); ++ if (ret) ++ goto err; ++ ++ if (done) ++ goto out; ++next: ++ bch2_btree_iter_advance(&iter); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); ++ if ((ret = bkey_err(k))) ++ goto err; ++ if (!k.k) ++ goto out; ++ } ++ ++ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { ++ ret = extent_back_merge(trans, &iter, insert, k); ++ if (ret) ++ goto err; ++ } ++out: ++ if (!bkey_deleted(&insert->k)) ++ ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++static noinline int flush_new_cached_update(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree_insert_entry *i, ++ enum btree_update_flags flags, ++ unsigned long ip) ++{ ++ struct btree_path *btree_path; ++ struct bkey k; ++ int ret; ++ ++ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, ++ BTREE_ITER_INTENT, _THIS_IP_); ++ ret = bch2_btree_path_traverse(trans, btree_path, 0); ++ if (ret) ++ goto out; ++ ++ /* ++ * The old key in the insert entry might actually refer to an existing ++ * key in the btree that has been deleted from cache and not yet ++ * flushed. Check for this and skip the flush so we don't run triggers ++ * against a stale key. 
++ */ ++ bch2_btree_path_peek_slot_exact(btree_path, &k); ++ if (!bkey_deleted(&k)) ++ goto out; ++ ++ i->key_cache_already_flushed = true; ++ i->flags |= BTREE_TRIGGER_NORUN; ++ ++ btree_path_set_should_be_locked(btree_path); ++ ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); ++out: ++ bch2_path_put(trans, btree_path, true); ++ return ret; ++} ++ ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, ++ struct bkey_i *k, enum btree_update_flags flags, ++ unsigned long ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i, n; ++ u64 seq = 0; ++ int cmp; ++ ++ EBUG_ON(!path->should_be_locked); ++ EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); ++ EBUG_ON(!bpos_eq(k->k.p, path->pos)); ++ ++ /* ++ * The transaction journal res hasn't been allocated at this point. ++ * That occurs at commit time. Reuse the seq field to pass in the seq ++ * of a prejournaled key. ++ */ ++ if (flags & BTREE_UPDATE_PREJOURNAL) ++ seq = trans->journal_res.seq; ++ ++ n = (struct btree_insert_entry) { ++ .flags = flags, ++ .bkey_type = __btree_node_type(path->level, path->btree_id), ++ .btree_id = path->btree_id, ++ .level = path->level, ++ .cached = path->cached, ++ .path = path, ++ .k = k, ++ .seq = seq, ++ .ip_allocated = ip, ++ }; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans_for_each_update(trans, i) ++ BUG_ON(i != trans->updates && ++ btree_insert_entry_cmp(i - 1, i) >= 0); ++#endif ++ ++ /* ++ * Pending updates are kept sorted: first, find position of new update, ++ * then delete/trim any updates the new update overwrites: ++ */ ++ trans_for_each_update(trans, i) { ++ cmp = btree_insert_entry_cmp(&n, i); ++ if (cmp <= 0) ++ break; ++ } ++ ++ if (!cmp && i < trans->updates + trans->nr_updates) { ++ EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); ++ ++ bch2_path_put(trans, i->path, true); ++ i->flags = n.flags; ++ i->cached = n.cached; ++ i->k = n.k; ++ i->path = n.path; ++ i->seq = n.seq; ++ i->ip_allocated = n.ip_allocated; ++ } else { ++ array_insert_item(trans->updates, trans->nr_updates, ++ i - trans->updates, n); ++ ++ i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; ++ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; ++ ++ if (unlikely(trans->journal_replay_not_finished)) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); ++ ++ if (j_k) { ++ i->old_k = j_k->k; ++ i->old_v = &j_k->v; ++ } ++ } ++ } ++ ++ __btree_path_get(i->path, true); ++ ++ /* ++ * If a key is present in the key cache, it must also exist in the ++ * btree - this is necessary for cache coherency. 
When iterating over ++ * a btree that's cached in the key cache, the btree iter code checks ++ * the key cache - but the key has to exist in the btree for that to ++ * work: ++ */ ++ if (path->cached && bkey_deleted(&i->old_k)) ++ return flush_new_cached_update(trans, path, i, flags, ip); ++ ++ return 0; ++} ++ ++int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *k, enum btree_update_flags flags) ++{ ++ struct btree_path *path = iter->update_path ?: iter->path; ++ struct bkey_cached *ck; ++ int ret; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return bch2_trans_update_extent(trans, iter, k, flags); ++ ++ if (bkey_deleted(&k->k) && ++ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && ++ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { ++ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ if (ret) ++ k->k.type = KEY_TYPE_whiteout; ++ } ++ ++ /* ++ * Ensure that updates to cached btrees go to the key cache: ++ */ ++ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && ++ !path->cached && ++ !path->level && ++ btree_id_cached(trans->c, path->btree_id)) { ++ if (!iter->key_cache_path || ++ !iter->key_cache_path->should_be_locked || ++ !bpos_eq(iter->key_cache_path->pos, k->k.p)) { ++ if (!iter->key_cache_path) ++ iter->key_cache_path = ++ bch2_path_get(trans, path->btree_id, path->pos, 1, 0, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_CACHED, _THIS_IP_); ++ ++ iter->key_cache_path = ++ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, ++ iter->flags & BTREE_ITER_INTENT, ++ _THIS_IP_); ++ ++ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, ++ BTREE_ITER_CACHED); ++ if (unlikely(ret)) ++ return ret; ++ ++ ck = (void *) iter->key_cache_path->l[0].b; ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); ++ } ++ ++ btree_path_set_should_be_locked(iter->key_cache_path); ++ } ++ ++ path = iter->key_cache_path; ++ } ++ ++ return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); ++} ++ ++/* ++ * Add a transaction update for a key that has already been journaled. 
++ */ ++int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, ++ struct btree_iter *iter, struct bkey_i *k, ++ enum btree_update_flags flags) ++{ ++ trans->journal_res.seq = seq; ++ return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| ++ BTREE_UPDATE_PREJOURNAL); ++} ++ ++int __must_check bch2_trans_update_buffered(struct btree_trans *trans, ++ enum btree_id btree, ++ struct bkey_i *k) ++{ ++ struct btree_write_buffered_key *i; ++ int ret; ++ ++ EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); ++ EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); ++ ++ trans_for_each_wb_update(trans, i) { ++ if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { ++ bkey_copy(&i->k, k); ++ return 0; ++ } ++ } ++ ++ if (!trans->wb_updates || ++ trans->nr_wb_updates == trans->wb_updates_size) { ++ struct btree_write_buffered_key *u; ++ ++ if (trans->nr_wb_updates == trans->wb_updates_size) { ++ struct btree_transaction_stats *s = btree_trans_stats(trans); ++ ++ BUG_ON(trans->wb_updates_size > U8_MAX / 2); ++ trans->wb_updates_size = max(1, trans->wb_updates_size * 2); ++ if (s) ++ s->wb_updates_size = trans->wb_updates_size; ++ } ++ ++ u = bch2_trans_kmalloc_nomemzero(trans, ++ trans->wb_updates_size * ++ sizeof(struct btree_write_buffered_key)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; ++ ++ if (trans->nr_wb_updates) ++ memcpy(u, trans->wb_updates, trans->nr_wb_updates * ++ sizeof(struct btree_write_buffered_key)); ++ trans->wb_updates = u; ++ } ++ ++ trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { ++ .btree = btree, ++ }; ++ ++ bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); ++ trans->nr_wb_updates++; ++ ++ return 0; ++} ++ ++int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, ++ enum btree_id btree, struct bpos end) ++{ ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); ++ k = bch2_btree_iter_prev(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ bch2_btree_iter_advance(iter); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ BUG_ON(k.k->type != KEY_TYPE_deleted); ++ ++ if (bkey_gt(k.k->p, end)) { ++ ret = -BCH_ERR_ENOSPC_btree_slot; ++ goto err; ++ } ++ ++ return 0; ++err: ++ bch2_trans_iter_exit(trans, iter); ++ return ret; ++} ++ ++void bch2_trans_commit_hook(struct btree_trans *trans, ++ struct btree_trans_commit_hook *h) ++{ ++ h->next = trans->hooks; ++ trans->hooks = h; ++} ++ ++int bch2_btree_insert_nonextent(struct btree_trans *trans, ++ enum btree_id btree, struct bkey_i *k, ++ enum btree_update_flags flags) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, btree, k->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, k, flags); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, ++ struct bkey_i *k, enum btree_update_flags flags) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, k, flags); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/** ++ * bch2_btree_insert - insert keys into the extent btree ++ * @c: pointer to struct bch_fs ++ * @id: btree to insert into ++ * 
@insert_keys: list of keys to insert ++ * @hook: insert callback ++ */ ++int bch2_btree_insert(struct bch_fs *c, enum btree_id id, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, int flags) ++{ ++ return bch2_trans_do(c, disk_res, journal_seq, flags, ++ __bch2_btree_insert(&trans, id, k, 0)); ++} ++ ++int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, ++ unsigned len, unsigned update_flags) ++{ ++ struct bkey_i *k; ++ ++ k = bch2_trans_kmalloc(trans, sizeof(*k)); ++ if (IS_ERR(k)) ++ return PTR_ERR(k); ++ ++ bkey_init(&k->k); ++ k->k.p = iter->pos; ++ bch2_key_resize(&k->k, len); ++ return bch2_trans_update(trans, iter, k, update_flags); ++} ++ ++int bch2_btree_delete_at(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned update_flags) ++{ ++ return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); ++} ++ ++int bch2_btree_delete_at_buffered(struct btree_trans *trans, ++ enum btree_id btree, struct bpos pos) ++{ ++ struct bkey_i *k; ++ ++ k = bch2_trans_kmalloc(trans, sizeof(*k)); ++ if (IS_ERR(k)) ++ return PTR_ERR(k); ++ ++ bkey_init(&k->k); ++ k->k.p = pos; ++ return bch2_trans_update_buffered(trans, btree, k); ++} ++ ++int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, ++ struct bpos start, struct bpos end, ++ unsigned update_flags, ++ u64 *journal_seq) ++{ ++ u32 restart_count = trans->restart_count; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); ++ while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(trans->c, 0); ++ struct bkey_i delete; ++ ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ bkey_init(&delete.k); ++ ++ /* ++ * This could probably be more efficient for extents: ++ */ ++ ++ /* ++ * For extents, iter.pos won't necessarily be the same as ++ * bkey_start_pos(k.k) (for non extents they always will be the ++ * same). It's important that we delete starting from iter.pos ++ * because the range we want to delete could start in the middle ++ * of k. ++ * ++ * (bch2_btree_iter_peek() does guarantee that iter.pos >= ++ * bkey_start_pos(k.k)). 
++ */ ++ delete.k.p = iter.pos; ++ ++ if (iter.flags & BTREE_ITER_IS_EXTENTS) ++ bch2_key_resize(&delete.k, ++ bpos_min(end, k.k->p).offset - ++ iter.pos.offset); ++ ++ ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: ++ bch2_trans_commit(trans, &disk_res, journal_seq, ++ BTREE_INSERT_NOFAIL); ++ bch2_disk_reservation_put(trans->c, &disk_res); ++err: ++ /* ++ * the bch2_trans_begin() call is in a weird place because we ++ * need to call it after every transaction commit, to avoid path ++ * overflow, but don't want to call it if the delete operation ++ * is a no-op and we have no work to do: ++ */ ++ bch2_trans_begin(trans); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ ret = 0; ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (!ret && trans_was_restarted(trans, restart_count)) ++ ret = -BCH_ERR_transaction_restart_nested; ++ return ret; ++} ++ ++/* ++ * bch_btree_delete_range - delete everything within a given range ++ * ++ * Range is a half open interval - [start, end) ++ */ ++int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, ++ struct bpos start, struct bpos end, ++ unsigned update_flags, ++ u64 *journal_seq) ++{ ++ int ret = bch2_trans_run(c, ++ bch2_btree_delete_range_trans(&trans, id, start, end, ++ update_flags, journal_seq)); ++ if (ret == -BCH_ERR_transaction_restart_nested) ++ ret = 0; ++ return ret; ++} ++ ++int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, ++ struct bpos pos, bool set) ++{ ++ struct bkey_i *k; ++ int ret = 0; ++ ++ k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); ++ ret = PTR_ERR_OR_ZERO(k); ++ if (unlikely(ret)) ++ return ret; ++ ++ bkey_init(&k->k); ++ k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; ++ k->k.p = pos; ++ ++ return bch2_trans_update_buffered(trans, btree, k); ++} ++ ++static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) ++{ ++ struct printbuf buf = PRINTBUF; ++ struct jset_entry_log *l; ++ unsigned u64s; ++ int ret; ++ ++ prt_vprintf(&buf, fmt, args); ++ ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; ++ if (ret) ++ goto err; ++ ++ u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); ++ ++ ret = darray_make_room(entries, jset_u64s(u64s)); ++ if (ret) ++ goto err; ++ ++ l = (void *) &darray_top(*entries); ++ l->entry.u64s = cpu_to_le16(u64s); ++ l->entry.btree_id = 0; ++ l->entry.level = 1; ++ l->entry.type = BCH_JSET_ENTRY_log; ++ l->entry.pad[0] = 0; ++ l->entry.pad[1] = 0; ++ l->entry.pad[2] = 0; ++ memcpy(l->d, buf.buf, buf.pos); ++ while (buf.pos & 7) ++ l->d[buf.pos++] = '\0'; ++ ++ entries->nr += jset_u64s(u64s); ++err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int ++__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, ++ va_list args) ++{ ++ int ret; ++ ++ if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { ++ ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); ++ } else { ++ ret = bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_LAZY_RW|commit_flags, ++ __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); ++ } ++ ++ return ret; ++} ++ ++int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) ++{ ++ va_list args; ++ int ret; ++ ++ va_start(args, fmt); ++ ret = __bch2_fs_log_msg(c, 0, fmt, args); ++ va_end(args); ++ return ret; ++} ++ ++/* ++ * Use for logging messages during recovery to enable reserved space and avoid ++ * blocking. ++ */ ++int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
++{ ++ va_list args; ++ int ret; ++ ++ va_start(args, fmt); ++ ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); ++ va_end(args); ++ return ret; ++} diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h new file mode 100644 -index 000000000..d6aec9341 +index 000000000..901c42b57 --- /dev/null +++ b/fs/bcachefs/btree_update.h -@@ -0,0 +1,352 @@ +@@ -0,0 +1,353 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_H +#define _BCACHEFS_BTREE_UPDATE_H @@ -29965,8 +32375,9 @@ index 000000000..d6aec9341 + return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos); +} + -+int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, enum btree_update_flags); ++int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, ++ enum btree_update_flags, ++ struct bkey_s_c, struct bkey_s_c); + +int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos); @@ -30136,10 +32547,10 @@ index 000000000..d6aec9341 +{ + struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, + btree_id, pos, flags|BTREE_ITER_INTENT, type); -+ struct bkey_i *ret = unlikely(IS_ERR(k.k)) ++ struct bkey_i *ret = IS_ERR(k.k) + ? ERR_CAST(k.k) + : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); -+ if (unlikely(IS_ERR(ret))) ++ if (IS_ERR(ret)) + bch2_trans_iter_exit(trans, iter); + return ret; +} @@ -30221,7 +32632,7 @@ index 000000000..d6aec9341 +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 -index 000000000..3659b2c08 +index 000000000..c741150e6 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c @@ -0,0 +1,2488 @@ @@ -30232,6 +32643,7 @@ index 000000000..3659b2c08 +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_gc.h" ++#include "btree_journal_iter.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_io.h" @@ -30244,7 +32656,6 @@ index 000000000..3659b2c08 +#include "journal.h" +#include "journal_reclaim.h" +#include "keylist.h" -+#include "recovery.h" +#include "replicas.h" +#include "super-io.h" +#include "trace.h" @@ -30415,7 +32826,7 @@ index 000000000..3659b2c08 + bch2_btree_node_hash_remove(&c->btree_cache, b); + __btree_node_free(c, b); + six_unlock_write(&b->c.lock); -+ mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); ++ mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); + + trans_for_each_path(trans, path) + if (path->l[level].b == b) { @@ -30947,7 +33358,7 @@ index 000000000..3659b2c08 + + mutex_unlock(&c->btree_interior_update_lock); + -+ mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); ++ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); + six_unlock_write(&b->c.lock); + + btree_node_write_if_need(c, b, SIX_LOCK_intent); @@ -32612,7 +35023,7 @@ index 000000000..3659b2c08 + as, + as->mode, + as->nodes_written, -+ atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, ++ closure_nr_remaining(&as->cl), + as->journal.seq); + mutex_unlock(&c->btree_interior_update_lock); +} @@ -33056,2115 +35467,12 @@ index 000000000..5e0a467fe +int bch2_fs_btree_interior_update_init(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ -diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c -new file mode 100644 -index 000000000..368972a00 ---- /dev/null -+++ b/fs/bcachefs/btree_update_leaf.c -@@ -0,0 +1,2097 @@ -+// 
SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_gc.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "btree_write_buffer.h" -+#include "buckets.h" -+#include "debug.h" -+#include "errcode.h" -+#include "error.h" -+#include "extent_update.h" -+#include "journal.h" -+#include "journal_reclaim.h" -+#include "keylist.h" -+#include "recovery.h" -+#include "subvolume.h" -+#include "replicas.h" -+#include "trace.h" -+ -+#include -+#include -+ -+/* -+ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a -+ * different snapshot: -+ */ -+static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) -+{ -+ struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); -+ -+ if (k.k && bpos_eq(path->pos, k.k->p)) -+ return k; -+ -+ bkey_init(u); -+ u->p = path->pos; -+ return (struct bkey_s_c) { u, NULL }; -+} -+ -+static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bch_fs *c = trans->c; -+ struct bkey u; -+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); -+ -+ if (unlikely(trans->journal_replay_not_finished)) { -+ struct bkey_i *j_k = -+ bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); -+ -+ if (j_k) -+ k = bkey_i_to_s_c(j_k); -+ } -+ -+ u = *k.k; -+ u.needs_whiteout = i->old_k.needs_whiteout; -+ -+ BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); -+ BUG_ON(i->old_v != k.v); -+#endif -+} -+ -+static int __must_check -+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, -+ struct bkey_i *, enum btree_update_flags, -+ unsigned long ip); -+ -+static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, -+ const struct btree_insert_entry *r) -+{ -+ return cmp_int(l->btree_id, r->btree_id) ?: -+ cmp_int(l->cached, r->cached) ?: -+ -cmp_int(l->level, r->level) ?: -+ bpos_cmp(l->k->k.p, r->k->k.p); -+} -+ -+static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) -+{ -+ return i->path->l + i->level; -+} -+ -+static inline bool same_leaf_as_prev(struct btree_trans *trans, -+ struct btree_insert_entry *i) -+{ -+ return i != trans->updates && -+ insert_l(&i[0])->b == insert_l(&i[-1])->b; -+} -+ -+static inline bool same_leaf_as_next(struct btree_trans *trans, -+ struct btree_insert_entry *i) -+{ -+ return i + 1 < trans->updates + trans->nr_updates && -+ insert_l(&i[0])->b == insert_l(&i[1])->b; -+} -+ -+inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b) -+{ -+ struct bch_fs *c = trans->c; -+ -+ if (unlikely(btree_node_just_written(b)) && -+ bch2_btree_post_write_cleanup(c, b)) -+ bch2_trans_node_reinit_iter(trans, b); -+ -+ /* -+ * If the last bset has been written, or if it's gotten too big - start -+ * a new bset to insert into: -+ */ -+ if (want_new_bset(c, b)) -+ bch2_btree_init_next(trans, b); -+} -+ -+/* Inserting into a given leaf node (last stage of insert): */ -+ -+/* Handle overwrites and do insert, for non extents: */ -+bool bch2_btree_bset_insert_key(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bkey_i *insert) -+{ -+ struct bkey_packed *k; -+ unsigned clobber_u64s = 0, new_u64s = 0; -+ -+ EBUG_ON(btree_node_just_written(b)); -+ EBUG_ON(bset_written(b, 
btree_bset_last(b))); -+ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); -+ EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); -+ EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); -+ EBUG_ON(insert->k.u64s > -+ bch_btree_keys_u64s_remaining(trans->c, b)); -+ -+ k = bch2_btree_node_iter_peek_all(node_iter, b); -+ if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) -+ k = NULL; -+ -+ /* @k is the key being overwritten/deleted, if any: */ -+ EBUG_ON(k && bkey_deleted(k)); -+ -+ /* Deleting, but not found? nothing to do: */ -+ if (bkey_deleted(&insert->k) && !k) -+ return false; -+ -+ if (bkey_deleted(&insert->k)) { -+ /* Deleting: */ -+ btree_account_key_drop(b, k); -+ k->type = KEY_TYPE_deleted; -+ -+ if (k->needs_whiteout) -+ push_whiteout(trans->c, b, insert->k.p); -+ k->needs_whiteout = false; -+ -+ if (k >= btree_bset_last(b)->start) { -+ clobber_u64s = k->u64s; -+ bch2_bset_delete(b, k, clobber_u64s); -+ goto fix_iter; -+ } else { -+ bch2_btree_path_fix_key_modified(trans, b, k); -+ } -+ -+ return true; -+ } -+ -+ if (k) { -+ /* Overwriting: */ -+ btree_account_key_drop(b, k); -+ k->type = KEY_TYPE_deleted; -+ -+ insert->k.needs_whiteout = k->needs_whiteout; -+ k->needs_whiteout = false; -+ -+ if (k >= btree_bset_last(b)->start) { -+ clobber_u64s = k->u64s; -+ goto overwrite; -+ } else { -+ bch2_btree_path_fix_key_modified(trans, b, k); -+ } -+ } -+ -+ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); -+overwrite: -+ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); -+ new_u64s = k->u64s; -+fix_iter: -+ if (clobber_u64s != new_u64s) -+ bch2_btree_node_iter_fix(trans, path, b, node_iter, k, -+ clobber_u64s, new_u64s); -+ return true; -+} -+ -+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, -+ unsigned i, u64 seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct btree_write *w = container_of(pin, struct btree_write, journal); -+ struct btree *b = container_of(w, struct btree, writes[i]); -+ struct btree_trans trans; -+ unsigned long old, new, v; -+ unsigned idx = w - b->writes; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); -+ v = READ_ONCE(b->flags); -+ -+ do { -+ old = new = v; -+ -+ if (!(old & (1 << BTREE_NODE_dirty)) || -+ !!(old & (1 << BTREE_NODE_write_idx)) != idx || -+ w->journal.seq != seq) -+ break; -+ -+ new &= ~BTREE_WRITE_TYPE_MASK; -+ new |= BTREE_WRITE_journal_reclaim; -+ new |= 1 << BTREE_NODE_need_write; -+ } while ((v = cmpxchg(&b->flags, old, new)) != old); -+ -+ btree_node_write_if_need(c, b, SIX_LOCK_read); -+ six_unlock_read(&b->c.lock); -+ -+ bch2_trans_exit(&trans); -+ return 0; -+} -+ -+int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) -+{ -+ return __btree_node_flush(j, pin, 0, seq); -+} -+ -+int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) -+{ -+ return __btree_node_flush(j, pin, 1, seq); -+} -+ -+inline void bch2_btree_add_journal_pin(struct bch_fs *c, -+ struct btree *b, u64 seq) -+{ -+ struct btree_write *w = btree_current_write(b); -+ -+ bch2_journal_pin_add(&c->journal, seq, &w->journal, -+ btree_node_write_idx(b) == 0 -+ ? 
bch2_btree_node_flush0 -+ : bch2_btree_node_flush1); -+} -+ -+/** -+ * btree_insert_key - insert a key one key into a leaf node -+ */ -+inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, -+ struct btree_path *path, -+ struct bkey_i *insert, -+ u64 journal_seq) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = path_l(path)->b; -+ struct bset_tree *t = bset_tree_last(b); -+ struct bset *i = bset(b, t); -+ int old_u64s = bset_u64s(t); -+ int old_live_u64s = b->nr.live_u64s; -+ int live_u64s_added, u64s_added; -+ -+ if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, -+ &path_l(path)->iter, insert))) -+ return; -+ -+ i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); -+ -+ bch2_btree_add_journal_pin(c, b, journal_seq); -+ -+ if (unlikely(!btree_node_dirty(b))) { -+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); -+ set_btree_node_dirty_acct(c, b); -+ } -+ -+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; -+ u64s_added = (int) bset_u64s(t) - old_u64s; -+ -+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); -+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); -+ -+ if (u64s_added > live_u64s_added && -+ bch2_maybe_compact_whiteouts(c, b)) -+ bch2_trans_node_reinit_iter(trans, b); -+} -+ -+/* Cached btree updates: */ -+ -+/* Normal update interface: */ -+ -+static inline void btree_insert_entry_checks(struct btree_trans *trans, -+ struct btree_insert_entry *i) -+{ -+ BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); -+ BUG_ON(i->cached != i->path->cached); -+ BUG_ON(i->level != i->path->level); -+ BUG_ON(i->btree_id != i->path->btree_id); -+ EBUG_ON(!i->level && -+ !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && -+ test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && -+ i->k->k.p.snapshot && -+ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); -+} -+ -+static noinline int -+bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, -+ unsigned long trace_ip) -+{ -+ return drop_locks_do(trans, -+ bch2_journal_preres_get(&trans->c->journal, -+ &trans->journal_preres, -+ trans->journal_preres_u64s, -+ (flags & BCH_WATERMARK_MASK))); -+} -+ -+static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, -+ unsigned flags) -+{ -+ return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, -+ trans->journal_u64s, flags); -+} -+ -+#define JSET_ENTRY_LOG_U64s 4 -+ -+static noinline void journal_transaction_name(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct journal *j = &c->journal; -+ struct jset_entry *entry = -+ bch2_journal_add_entry(j, &trans->journal_res, -+ BCH_JSET_ENTRY_log, 0, 0, -+ JSET_ENTRY_LOG_U64s); -+ struct jset_entry_log *l = -+ container_of(entry, struct jset_entry_log, entry); -+ -+ strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); -+} -+ -+static inline int btree_key_can_insert(struct btree_trans *trans, -+ struct btree *b, unsigned u64s) -+{ -+ struct bch_fs *c = trans->c; -+ -+ if (!bch2_btree_node_insert_fits(c, b, u64s)) -+ return -BCH_ERR_btree_insert_btree_node_full; -+ -+ return 0; -+} -+ -+static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, -+ struct btree_path *path, unsigned u64s) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached *ck = (void *) path->l[0].b; -+ struct btree_insert_entry *i; -+ unsigned new_u64s; -+ struct bkey_i 
*new_k; -+ -+ EBUG_ON(path->level); -+ -+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && -+ bch2_btree_key_cache_must_wait(c) && -+ !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) -+ return -BCH_ERR_btree_insert_need_journal_reclaim; -+ -+ /* -+ * bch2_varint_decode can read past the end of the buffer by at most 7 -+ * bytes (it won't be used): -+ */ -+ u64s += 1; -+ -+ if (u64s <= ck->u64s) -+ return 0; -+ -+ new_u64s = roundup_pow_of_two(u64s); -+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); -+ if (!new_k) { -+ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", -+ bch2_btree_ids[path->btree_id], new_u64s); -+ return -BCH_ERR_ENOMEM_btree_key_cache_insert; -+ } -+ -+ trans_for_each_update(trans, i) -+ if (i->old_v == &ck->k->v) -+ i->old_v = &new_k->v; -+ -+ ck->u64s = new_u64s; -+ ck->k = new_k; -+ return 0; -+} -+ -+/* Triggers: */ -+ -+static int run_one_mem_trigger(struct btree_trans *trans, -+ struct btree_insert_entry *i, -+ unsigned flags) -+{ -+ struct bkey_s_c old = { &i->old_k, i->old_v }; -+ struct bkey_i *new = i->k; -+ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); -+ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); -+ int ret; -+ -+ verify_update_old_key(trans, i); -+ -+ if (unlikely(flags & BTREE_TRIGGER_NORUN)) -+ return 0; -+ -+ if (!btree_node_type_needs_gc(i->btree_id)) -+ return 0; -+ -+ if (old_ops->atomic_trigger == new_ops->atomic_trigger && -+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { -+ ret = bch2_mark_key(trans, i->btree_id, i->level, -+ old, bkey_i_to_s_c(new), -+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); -+ } else { -+ struct bkey _deleted = KEY(0, 0, 0); -+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; -+ -+ _deleted.p = i->path->pos; -+ -+ ret = bch2_mark_key(trans, i->btree_id, i->level, -+ deleted, bkey_i_to_s_c(new), -+ BTREE_TRIGGER_INSERT|flags) ?: -+ bch2_mark_key(trans, i->btree_id, i->level, -+ old, deleted, -+ BTREE_TRIGGER_OVERWRITE|flags); -+ } -+ -+ return ret; -+} -+ -+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, -+ bool overwrite) -+{ -+ /* -+ * Transactional triggers create new btree_insert_entries, so we can't -+ * pass them a pointer to a btree_insert_entry, that memory is going to -+ * move: -+ */ -+ struct bkey old_k = i->old_k; -+ struct bkey_s_c old = { &old_k, i->old_v }; -+ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); -+ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); -+ -+ verify_update_old_key(trans, i); -+ -+ if ((i->flags & BTREE_TRIGGER_NORUN) || -+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) -+ return 0; -+ -+ if (!i->insert_trigger_run && -+ !i->overwrite_trigger_run && -+ old_ops->trans_trigger == new_ops->trans_trigger && -+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { -+ i->overwrite_trigger_run = true; -+ i->insert_trigger_run = true; -+ return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, -+ BTREE_TRIGGER_INSERT| -+ BTREE_TRIGGER_OVERWRITE| -+ i->flags) ?: 1; -+ } else if (overwrite && !i->overwrite_trigger_run) { -+ i->overwrite_trigger_run = true; -+ return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; -+ } else if (!overwrite && !i->insert_trigger_run) { -+ i->insert_trigger_run = true; -+ return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; -+ } else { -+ return 0; -+ } -+} -+ -+static int run_btree_triggers(struct btree_trans 
*trans, enum btree_id btree_id, -+ struct btree_insert_entry *btree_id_start) -+{ -+ struct btree_insert_entry *i; -+ bool trans_trigger_run; -+ int ret, overwrite; -+ -+ for (overwrite = 1; overwrite >= 0; --overwrite) { -+ -+ /* -+ * Running triggers will append more updates to the list of updates as -+ * we're walking it: -+ */ -+ do { -+ trans_trigger_run = false; -+ -+ for (i = btree_id_start; -+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; -+ i++) { -+ if (i->btree_id != btree_id) -+ continue; -+ -+ ret = run_one_trans_trigger(trans, i, overwrite); -+ if (ret < 0) -+ return ret; -+ if (ret) -+ trans_trigger_run = true; -+ } -+ } while (trans_trigger_run); -+ } -+ -+ return 0; -+} -+ -+static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; -+ unsigned btree_id = 0; -+ int ret = 0; -+ -+ /* -+ * -+ * For a given btree, this algorithm runs insert triggers before -+ * overwrite triggers: this is so that when extents are being moved -+ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before -+ * they are re-added. -+ */ -+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { -+ if (btree_id == BTREE_ID_alloc) -+ continue; -+ -+ while (btree_id_start < trans->updates + trans->nr_updates && -+ btree_id_start->btree_id < btree_id) -+ btree_id_start++; -+ -+ ret = run_btree_triggers(trans, btree_id, btree_id_start); -+ if (ret) -+ return ret; -+ } -+ -+ trans_for_each_update(trans, i) { -+ if (i->btree_id > BTREE_ID_alloc) -+ break; -+ if (i->btree_id == BTREE_ID_alloc) { -+ ret = run_btree_triggers(trans, BTREE_ID_alloc, i); -+ if (ret) -+ return ret; -+ break; -+ } -+ } -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans_for_each_update(trans, i) -+ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && -+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && -+ (!i->insert_trigger_run || !i->overwrite_trigger_run)); -+#endif -+ return 0; -+} -+ -+static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ int ret = 0; -+ -+ trans_for_each_update(trans, i) { -+ /* -+ * XXX: synchronization of cached update triggers with gc -+ * XXX: synchronization of interior node updates with gc -+ */ -+ BUG_ON(i->cached || i->level); -+ -+ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { -+ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); -+ if (ret) -+ break; -+ } -+ } -+ -+ return ret; -+} -+ -+static inline int -+bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, -+ struct btree_insert_entry **stopped_at, -+ unsigned long trace_ip) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ struct btree_write_buffered_key *wb; -+ struct btree_trans_commit_hook *h; -+ unsigned u64s = 0; -+ bool marking = false; -+ int ret; -+ -+ if (race_fault()) { -+ trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); -+ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); -+ } -+ -+ /* -+ * Check if the insert will fit in the leaf node with the write lock -+ * held, otherwise another thread could write the node changing the -+ * amount of space available: -+ */ -+ -+ prefetch(&trans->c->journal.flags); -+ -+ trans_for_each_update(trans, i) { -+ /* Multiple inserts might go to same leaf: */ -+ if (!same_leaf_as_prev(trans, i)) -+ u64s = 0; -+ -+ u64s += i->k->k.u64s; -+ ret = !i->cached -+ ? 
btree_key_can_insert(trans, insert_l(i)->b, u64s) -+ : btree_key_can_insert_cached(trans, flags, i->path, u64s); -+ if (ret) { -+ *stopped_at = i; -+ return ret; -+ } -+ -+ if (btree_node_type_needs_gc(i->bkey_type)) -+ marking = true; -+ } -+ -+ if (trans->nr_wb_updates && -+ trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) -+ return -BCH_ERR_btree_insert_need_flush_buffer; -+ -+ /* -+ * Don't get journal reservation until after we know insert will -+ * succeed: -+ */ -+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ ret = bch2_trans_journal_res_get(trans, -+ (flags & BCH_WATERMARK_MASK)| -+ JOURNAL_RES_GET_NONBLOCK); -+ if (ret) -+ return ret; -+ -+ if (unlikely(trans->journal_transaction_names)) -+ journal_transaction_name(trans); -+ } else { -+ trans->journal_res.seq = c->journal.replay_journal_seq; -+ } -+ -+ /* -+ * Not allowed to fail after we've gotten our journal reservation - we -+ * have to use it: -+ */ -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && -+ !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { -+ if (bch2_journal_seq_verify) -+ trans_for_each_update(trans, i) -+ i->k->k.version.lo = trans->journal_res.seq; -+ else if (bch2_inject_invalid_keys) -+ trans_for_each_update(trans, i) -+ i->k->k.version = MAX_VERSION; -+ } -+ -+ if (trans->fs_usage_deltas && -+ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) -+ return -BCH_ERR_btree_insert_need_mark_replicas; -+ -+ if (trans->nr_wb_updates) { -+ EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); -+ -+ ret = bch2_btree_insert_keys_write_buffer(trans); -+ if (ret) -+ goto revert_fs_usage; -+ } -+ -+ h = trans->hooks; -+ while (h) { -+ ret = h->fn(trans, h); -+ if (ret) -+ goto revert_fs_usage; -+ h = h->next; -+ } -+ -+ trans_for_each_update(trans, i) -+ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { -+ ret = run_one_mem_trigger(trans, i, i->flags); -+ if (ret) -+ goto fatal_err; -+ } -+ -+ if (unlikely(c->gc_pos.phase)) { -+ ret = bch2_trans_commit_run_gc_triggers(trans); -+ if (ret) -+ goto fatal_err; -+ } -+ -+ if (unlikely(trans->extra_journal_entries.nr)) { -+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), -+ trans->extra_journal_entries.data, -+ trans->extra_journal_entries.nr); -+ -+ trans->journal_res.offset += trans->extra_journal_entries.nr; -+ trans->journal_res.u64s -= trans->extra_journal_entries.nr; -+ } -+ -+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ struct journal *j = &c->journal; -+ struct jset_entry *entry; -+ -+ trans_for_each_update(trans, i) { -+ if (i->key_cache_already_flushed) -+ continue; -+ -+ if (i->flags & BTREE_UPDATE_NOJOURNAL) -+ continue; -+ -+ verify_update_old_key(trans, i); -+ -+ if (trans->journal_transaction_names) { -+ entry = bch2_journal_add_entry(j, &trans->journal_res, -+ BCH_JSET_ENTRY_overwrite, -+ i->btree_id, i->level, -+ i->old_k.u64s); -+ bkey_reassemble(&entry->start[0], -+ (struct bkey_s_c) { &i->old_k, i->old_v }); -+ } -+ -+ entry = bch2_journal_add_entry(j, &trans->journal_res, -+ BCH_JSET_ENTRY_btree_keys, -+ i->btree_id, i->level, -+ i->k->k.u64s); -+ bkey_copy(&entry->start[0], i->k); -+ } -+ -+ trans_for_each_wb_update(trans, wb) { -+ entry = bch2_journal_add_entry(j, &trans->journal_res, -+ BCH_JSET_ENTRY_btree_keys, -+ wb->btree, 0, -+ wb->k.k.u64s); -+ bkey_copy(&entry->start[0], &wb->k); -+ } -+ -+ if (trans->journal_seq) -+ *trans->journal_seq = trans->journal_res.seq; -+ } -+ -+ trans_for_each_update(trans, i) { -+ i->k->k.needs_whiteout = false; -+ -+ if (!i->cached) { 
-+ u64 seq = trans->journal_res.seq; -+ -+ if (i->flags & BTREE_UPDATE_PREJOURNAL) -+ seq = i->seq; -+ -+ bch2_btree_insert_key_leaf(trans, i->path, i->k, seq); -+ } else if (!i->key_cache_already_flushed) -+ bch2_btree_insert_key_cached(trans, flags, i); -+ else { -+ bch2_btree_key_cache_drop(trans, i->path); -+ btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); -+ } -+ } -+ -+ return 0; -+fatal_err: -+ bch2_fatal_error(c); -+revert_fs_usage: -+ if (trans->fs_usage_deltas) -+ bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); -+ return ret; -+} -+ -+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -+{ -+ while (--i >= trans->updates) { -+ if (same_leaf_as_prev(trans, i)) -+ continue; -+ -+ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); -+ } -+ -+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -+} -+ -+static inline int trans_lock_write(struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update(trans, i) { -+ if (same_leaf_as_prev(trans, i)) -+ continue; -+ -+ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) -+ return trans_lock_write_fail(trans, i); -+ -+ if (!i->cached) -+ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); -+ } -+ -+ return 0; -+} -+ -+static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i; -+ struct btree_write_buffered_key *wb; -+ -+ trans_for_each_update(trans, i) -+ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); -+ -+ trans_for_each_wb_update(trans, wb) -+ bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, -+ struct btree_insert_entry *i, -+ struct printbuf *err) -+{ -+ struct bch_fs *c = trans->c; -+ int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? 
READ : WRITE; -+ -+ printbuf_reset(err); -+ prt_printf(err, "invalid bkey on insert from %s -> %ps", -+ trans->fn, (void *) i->ip_allocated); -+ prt_newline(err); -+ printbuf_indent_add(err, 2); -+ -+ bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); -+ prt_newline(err); -+ -+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), -+ i->bkey_type, rw, err); -+ bch2_print_string_as_lines(KERN_ERR, err->buf); -+ -+ bch2_inconsistent_error(c); -+ bch2_dump_trans_updates(trans); -+ printbuf_exit(err); -+ -+ return -EINVAL; -+} -+#endif -+ -+/* -+ * Get journal reservation, take write locks, and attempt to do btree update(s): -+ */ -+static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, -+ struct btree_insert_entry **stopped_at, -+ unsigned long trace_ip) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ int ret = 0, u64s_delta = 0; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans_for_each_update(trans, i) { -+ struct printbuf buf = PRINTBUF; -+ enum bkey_invalid_flags invalid_flags = 0; -+ -+ if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) -+ invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; -+ -+ if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), -+ i->bkey_type, invalid_flags, &buf))) -+ ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); -+ btree_insert_entry_checks(trans, i); -+ printbuf_exit(&buf); -+ -+ if (ret) -+ return ret; -+ } -+#endif -+ -+ trans_for_each_update(trans, i) { -+ if (i->cached) -+ continue; -+ -+ u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; -+ u64s_delta -= i->old_btree_u64s; -+ -+ if (!same_leaf_as_next(trans, i)) { -+ if (u64s_delta <= 0) { -+ ret = bch2_foreground_maybe_merge(trans, i->path, -+ i->level, flags); -+ if (unlikely(ret)) -+ return ret; -+ } -+ -+ u64s_delta = 0; -+ } -+ } -+ -+ ret = bch2_journal_preres_get(&c->journal, -+ &trans->journal_preres, trans->journal_preres_u64s, -+ (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); -+ if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) -+ ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); -+ if (unlikely(ret)) -+ return ret; -+ -+ ret = trans_lock_write(trans); -+ if (unlikely(ret)) -+ return ret; -+ -+ ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); -+ -+ if (!ret && unlikely(trans->journal_replay_not_finished)) -+ bch2_drop_overwrites_from_journal(trans); -+ -+ trans_for_each_update(trans, i) -+ if (!same_leaf_as_prev(trans, i)) -+ bch2_btree_node_unlock_write_inlined(trans, i->path, -+ insert_l(i)->b); -+ -+ if (!ret && trans->journal_pin) -+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, -+ trans->journal_pin, NULL); -+ -+ /* -+ * Drop journal reservation after dropping write locks, since dropping -+ * the journal reservation may kick off a journal write: -+ */ -+ bch2_journal_res_put(&c->journal, &trans->journal_res); -+ -+ if (unlikely(ret)) -+ return ret; -+ -+ bch2_trans_downgrade(trans); -+ -+ return 0; -+} -+ -+static int journal_reclaim_wait_done(struct bch_fs *c) -+{ -+ int ret = bch2_journal_error(&c->journal) ?: -+ !bch2_btree_key_cache_must_wait(c); -+ -+ if (!ret) -+ journal_reclaim_kick(&c->journal); -+ return ret; -+} -+ -+static noinline -+int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, -+ struct btree_insert_entry *i, -+ int ret, unsigned long trace_ip) -+{ -+ struct bch_fs *c = trans->c; -+ -+ switch (ret) { -+ case -BCH_ERR_btree_insert_btree_node_full: -+ ret = bch2_btree_split_leaf(trans, i->path, flags); -+ if (bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) -+ trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); -+ break; -+ case -BCH_ERR_btree_insert_need_mark_replicas: -+ ret = drop_locks_do(trans, -+ bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); -+ break; -+ case -BCH_ERR_journal_res_get_blocked: -+ /* -+ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK -+ * flag -+ */ -+ if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && -+ (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { -+ ret = -BCH_ERR_journal_reclaim_would_deadlock; -+ break; -+ } -+ -+ ret = drop_locks_do(trans, -+ bch2_trans_journal_res_get(trans, -+ (flags & BCH_WATERMARK_MASK)| -+ JOURNAL_RES_GET_CHECK)); -+ break; -+ case -BCH_ERR_btree_insert_need_journal_reclaim: -+ bch2_trans_unlock(trans); -+ -+ trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); -+ -+ wait_event_freezable(c->journal.reclaim_wait, -+ (ret = journal_reclaim_wait_done(c))); -+ if (ret < 0) -+ break; -+ -+ ret = bch2_trans_relock(trans); -+ break; -+ case -BCH_ERR_btree_insert_need_flush_buffer: { -+ struct btree_write_buffer *wb = &c->btree_write_buffer; -+ -+ ret = 0; -+ -+ if (wb->state.nr > wb->size * 3 / 4) { -+ bch2_trans_unlock(trans); -+ mutex_lock(&wb->flush_lock); -+ -+ if (wb->state.nr > wb->size * 3 / 4) { -+ bch2_trans_begin(trans); -+ ret = __bch2_btree_write_buffer_flush(trans, -+ flags|BTREE_INSERT_NOCHECK_RW, true); -+ if (!ret) { -+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); -+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); -+ } -+ } else { -+ mutex_unlock(&wb->flush_lock); -+ ret = bch2_trans_relock(trans); -+ } -+ } -+ break; -+ } -+ default: -+ BUG_ON(ret >= 0); -+ break; -+ } -+ -+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); -+ -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && -+ !(flags & BTREE_INSERT_NOWAIT) && -+ (flags & BTREE_INSERT_NOFAIL), c, -+ "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); -+ -+ return ret; -+} -+ -+static noinline int -+bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || -+ test_bit(BCH_FS_STARTED, &c->flags)) -+ return -BCH_ERR_erofs_trans_commit; -+ -+ ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); -+ if (ret) -+ return ret; -+ -+ bch2_write_ref_get(c, BCH_WRITE_REF_trans); -+ return 0; -+} -+ -+/* -+ * This is for updates done in the early part of fsck - btree_gc - before we've -+ * gone RW. we only add the new key to the list of keys for journal replay to -+ * do. 
-+ */ -+static noinline int -+do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ int ret = 0; -+ -+ trans_for_each_update(trans, i) { -+ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); -+ if (ret) -+ break; -+ } -+ -+ return ret; -+} -+ -+int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i = NULL; -+ struct btree_write_buffered_key *wb; -+ unsigned u64s; -+ int ret = 0; -+ -+ if (!trans->nr_updates && -+ !trans->nr_wb_updates && -+ !trans->extra_journal_entries.nr) -+ goto out_reset; -+ -+ if (flags & BTREE_INSERT_GC_LOCK_HELD) -+ lockdep_assert_held(&c->gc_lock); -+ -+ ret = bch2_trans_commit_run_triggers(trans); -+ if (ret) -+ goto out_reset; -+ -+ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { -+ ret = do_bch2_trans_commit_to_journal_replay(trans); -+ goto out_reset; -+ } -+ -+ if (!(flags & BTREE_INSERT_NOCHECK_RW) && -+ unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { -+ ret = bch2_trans_commit_get_rw_cold(trans, flags); -+ if (ret) -+ goto out_reset; -+ } -+ -+ if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && -+ mutex_trylock(&c->btree_write_buffer.flush_lock)) { -+ bch2_trans_begin(trans); -+ bch2_trans_unlock(trans); -+ -+ ret = __bch2_btree_write_buffer_flush(trans, -+ flags|BTREE_INSERT_NOCHECK_RW, true); -+ if (!ret) { -+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); -+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); -+ } -+ goto out; -+ } -+ -+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); -+ -+ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); -+ -+ trans->journal_u64s = trans->extra_journal_entries.nr; -+ trans->journal_preres_u64s = 0; -+ -+ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); -+ -+ if (trans->journal_transaction_names) -+ trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); -+ -+ trans_for_each_update(trans, i) { -+ EBUG_ON(!i->path->should_be_locked); -+ -+ ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); -+ if (unlikely(ret)) -+ goto out; -+ -+ EBUG_ON(!btree_node_intent_locked(i->path, i->level)); -+ -+ if (i->key_cache_already_flushed) -+ continue; -+ -+ /* we're going to journal the key being updated: */ -+ u64s = jset_u64s(i->k->k.u64s); -+ if (i->cached && -+ likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) -+ trans->journal_preres_u64s += u64s; -+ -+ if (i->flags & BTREE_UPDATE_NOJOURNAL) -+ continue; -+ -+ trans->journal_u64s += u64s; -+ -+ /* and we're also going to log the overwrite: */ -+ if (trans->journal_transaction_names) -+ trans->journal_u64s += jset_u64s(i->old_k.u64s); -+ } -+ -+ trans_for_each_wb_update(trans, wb) -+ trans->journal_u64s += jset_u64s(wb->k.k.u64s); -+ -+ if (trans->extra_journal_res) { -+ ret = bch2_disk_reservation_add(c, trans->disk_res, -+ trans->extra_journal_res, -+ (flags & BTREE_INSERT_NOFAIL) -+ ? 
BCH_DISK_RESERVATION_NOFAIL : 0); -+ if (ret) -+ goto err; -+ } -+retry: -+ bch2_trans_verify_not_in_restart(trans); -+ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); -+ -+ ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); -+ -+ /* make sure we didn't drop or screw up locks: */ -+ bch2_trans_verify_locks(trans); -+ -+ if (ret) -+ goto err; -+ -+ trace_and_count(c, transaction_commit, trans, _RET_IP_); -+out: -+ bch2_journal_preres_put(&c->journal, &trans->journal_preres); -+ -+ if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) -+ bch2_write_ref_put(c, BCH_WRITE_REF_trans); -+out_reset: -+ bch2_trans_reset_updates(trans); -+ -+ return ret; -+err: -+ ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); -+ if (ret) -+ goto out; -+ -+ goto retry; -+} -+ -+static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, -+ enum btree_id id, -+ struct bpos pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, id, pos, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ while (1) { -+ k = bch2_btree_iter_prev(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ -+ if (!k.k) -+ break; -+ -+ if (!bkey_eq(pos, k.k->p)) -+ break; -+ -+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { -+ ret = 1; -+ break; -+ } -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+} -+ -+static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, -+ enum btree_id id, -+ struct bpos pos) -+{ -+ if (!btree_type_has_snapshots(id) || -+ bch2_snapshot_is_leaf(trans->c, pos.snapshot)) -+ return 0; -+ -+ return __check_pos_snapshot_overwritten(trans, id, pos); -+} -+ -+static noinline int extent_front_merge(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ struct bkey_i **insert, -+ enum btree_update_flags flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_i *update; -+ int ret; -+ -+ update = bch2_bkey_make_mut_noupdate(trans, k); -+ ret = PTR_ERR_OR_ZERO(update); -+ if (ret) -+ return ret; -+ -+ if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) -+ return 0; -+ -+ ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?: -+ check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p); -+ if (ret < 0) -+ return ret; -+ if (ret) -+ return 0; -+ -+ ret = bch2_btree_delete_at(trans, iter, flags); -+ if (ret) -+ return ret; -+ -+ *insert = update; -+ return 0; -+} -+ -+static noinline int extent_back_merge(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?: -+ check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p); -+ if (ret < 0) -+ return ret; -+ if (ret) -+ return 0; -+ -+ bch2_bkey_merge(c, bkey_i_to_s(insert), k); -+ return 0; -+} -+ -+/* -+ * When deleting, check if we need to emit a whiteout (because we're overwriting -+ * something in an ancestor snapshot) -+ */ -+static int need_whiteout_for_snapshot(struct btree_trans *trans, -+ enum btree_id btree_id, struct bpos pos) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ u32 snapshot = pos.snapshot; -+ int ret; -+ -+ if (!bch2_snapshot_parent(trans->c, pos.snapshot)) -+ return 0; -+ -+ pos.snapshot++; -+ -+ for_each_btree_key_norestart(trans, iter, btree_id, pos, -+ BTREE_ITER_ALL_SNAPSHOTS| -+ BTREE_ITER_NOPRESERVE, k, 
ret) { -+ if (!bkey_eq(k.k->p, pos)) -+ break; -+ -+ if (bch2_snapshot_is_ancestor(trans->c, snapshot, -+ k.k->p.snapshot)) { -+ ret = !bkey_whiteout(k.k); -+ break; -+ } -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+} -+ -+int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, -+ enum btree_id id, -+ struct bpos old_pos, -+ struct bpos new_pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter old_iter, new_iter = { NULL }; -+ struct bkey_s_c old_k, new_k; -+ snapshot_id_list s; -+ struct bkey_i *update; -+ int ret; -+ -+ if (!bch2_snapshot_has_children(c, old_pos.snapshot)) -+ return 0; -+ -+ darray_init(&s); -+ -+ bch2_trans_iter_init(trans, &old_iter, id, old_pos, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ while ((old_k = bch2_btree_iter_prev(&old_iter)).k && -+ !(ret = bkey_err(old_k)) && -+ bkey_eq(old_pos, old_k.k->p)) { -+ struct bpos whiteout_pos = -+ SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; -+ -+ if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || -+ snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) -+ continue; -+ -+ new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_INTENT); -+ ret = bkey_err(new_k); -+ if (ret) -+ break; -+ -+ if (new_k.k->type == KEY_TYPE_deleted) { -+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); -+ ret = PTR_ERR_OR_ZERO(update); -+ if (ret) -+ break; -+ -+ bkey_init(&update->k); -+ update->k.p = whiteout_pos; -+ update->k.type = KEY_TYPE_whiteout; -+ -+ ret = bch2_trans_update(trans, &new_iter, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ } -+ bch2_trans_iter_exit(trans, &new_iter); -+ -+ ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(trans, &new_iter); -+ bch2_trans_iter_exit(trans, &old_iter); -+ darray_exit(&s); -+ -+ return ret; -+} -+ -+int bch2_trans_update_extent(struct btree_trans *trans, -+ struct btree_iter *orig_iter, -+ struct bkey_i *insert, -+ enum btree_update_flags flags) -+{ -+ struct btree_iter iter; -+ struct bpos start = bkey_start_pos(&insert->k); -+ struct bkey_i *update; -+ struct bkey_s_c k; -+ enum btree_id btree_id = orig_iter->btree_id; -+ int ret = 0, compressed_sectors; -+ -+ bch2_trans_iter_init(trans, &iter, btree_id, start, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_WITH_UPDATES| -+ BTREE_ITER_NOT_EXTENTS); -+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); -+ if ((ret = bkey_err(k))) -+ goto err; -+ if (!k.k) -+ goto out; -+ -+ if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { -+ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { -+ ret = extent_front_merge(trans, &iter, k, &insert, flags); -+ if (ret) -+ goto err; -+ } -+ -+ goto next; -+ } -+ -+ while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { -+ bool front_split = bkey_lt(bkey_start_pos(k.k), start); -+ bool back_split = bkey_gt(k.k->p, insert->k.p); -+ -+ /* -+ * If we're going to be splitting a compressed extent, note it -+ * so that __bch2_trans_commit() can increase our disk -+ * reservation: -+ */ -+ if (((front_split && back_split) || -+ ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) && -+ (compressed_sectors = bch2_bkey_sectors_compressed(k))) -+ trans->extra_journal_res += compressed_sectors; -+ -+ if (front_split) { -+ update = bch2_bkey_make_mut_noupdate(trans, k); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bch2_cut_back(start, update); -+ -+ ret = 
bch2_insert_snapshot_whiteouts(trans, btree_id, -+ k.k->p, update->k.p) ?: -+ bch2_btree_insert_nonextent(trans, btree_id, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); -+ if (ret) -+ goto err; -+ } -+ -+ if (k.k->p.snapshot != insert->k.p.snapshot && -+ (front_split || back_split)) { -+ update = bch2_bkey_make_mut_noupdate(trans, k); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bch2_cut_front(start, update); -+ bch2_cut_back(insert->k.p, update); -+ -+ ret = bch2_insert_snapshot_whiteouts(trans, btree_id, -+ k.k->p, update->k.p) ?: -+ bch2_btree_insert_nonextent(trans, btree_id, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); -+ if (ret) -+ goto err; -+ } -+ -+ if (bkey_le(k.k->p, insert->k.p)) { -+ update = bch2_trans_kmalloc(trans, sizeof(*update)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bkey_init(&update->k); -+ update->k.p = k.k->p; -+ update->k.p.snapshot = insert->k.p.snapshot; -+ -+ if (insert->k.p.snapshot != k.k->p.snapshot) { -+ update->k.type = KEY_TYPE_whiteout; -+ } else if (btree_type_has_snapshots(btree_id)) { -+ ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); -+ if (ret < 0) -+ goto err; -+ if (ret) -+ update->k.type = KEY_TYPE_whiteout; -+ } -+ -+ ret = bch2_btree_insert_nonextent(trans, btree_id, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); -+ if (ret) -+ goto err; -+ } -+ -+ if (back_split) { -+ update = bch2_bkey_make_mut_noupdate(trans, k); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bch2_cut_front(insert->k.p, update); -+ -+ ret = bch2_trans_update_by_path(trans, iter.path, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| -+ flags, _RET_IP_); -+ if (ret) -+ goto err; -+ goto out; -+ } -+next: -+ bch2_btree_iter_advance(&iter); -+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); -+ if ((ret = bkey_err(k))) -+ goto err; -+ if (!k.k) -+ goto out; -+ } -+ -+ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { -+ ret = extent_back_merge(trans, &iter, insert, k); -+ if (ret) -+ goto err; -+ } -+out: -+ if (!bkey_deleted(&insert->k)) { -+ /* -+ * Rewinding iterators is expensive: get a new one and the one -+ * that points to the start of insert will be cloned from: -+ */ -+ bch2_trans_iter_exit(trans, &iter); -+ bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_INTENT); -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(trans, &iter, insert, flags); -+ } -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+} -+ -+static noinline int flush_new_cached_update(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree_insert_entry *i, -+ enum btree_update_flags flags, -+ unsigned long ip) -+{ -+ struct btree_path *btree_path; -+ struct bkey k; -+ int ret; -+ -+ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, -+ BTREE_ITER_INTENT, _THIS_IP_); -+ ret = bch2_btree_path_traverse(trans, btree_path, 0); -+ if (ret) -+ goto out; -+ -+ /* -+ * The old key in the insert entry might actually refer to an existing -+ * key in the btree that has been deleted from cache and not yet -+ * flushed. Check for this and skip the flush so we don't run triggers -+ * against a stale key. 
-+ */ -+ bch2_btree_path_peek_slot_exact(btree_path, &k); -+ if (!bkey_deleted(&k)) -+ goto out; -+ -+ i->key_cache_already_flushed = true; -+ i->flags |= BTREE_TRIGGER_NORUN; -+ -+ btree_path_set_should_be_locked(btree_path); -+ ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); -+out: -+ bch2_path_put(trans, btree_path, true); -+ return ret; -+} -+ -+static int __must_check -+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, -+ struct bkey_i *k, enum btree_update_flags flags, -+ unsigned long ip) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i, n; -+ u64 seq = 0; -+ int cmp; -+ -+ EBUG_ON(!path->should_be_locked); -+ EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); -+ EBUG_ON(!bpos_eq(k->k.p, path->pos)); -+ -+ /* -+ * The transaction journal res hasn't been allocated at this point. -+ * That occurs at commit time. Reuse the seq field to pass in the seq -+ * of a prejournaled key. -+ */ -+ if (flags & BTREE_UPDATE_PREJOURNAL) -+ seq = trans->journal_res.seq; -+ -+ n = (struct btree_insert_entry) { -+ .flags = flags, -+ .bkey_type = __btree_node_type(path->level, path->btree_id), -+ .btree_id = path->btree_id, -+ .level = path->level, -+ .cached = path->cached, -+ .path = path, -+ .k = k, -+ .seq = seq, -+ .ip_allocated = ip, -+ }; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans_for_each_update(trans, i) -+ BUG_ON(i != trans->updates && -+ btree_insert_entry_cmp(i - 1, i) >= 0); -+#endif -+ -+ /* -+ * Pending updates are kept sorted: first, find position of new update, -+ * then delete/trim any updates the new update overwrites: -+ */ -+ trans_for_each_update(trans, i) { -+ cmp = btree_insert_entry_cmp(&n, i); -+ if (cmp <= 0) -+ break; -+ } -+ -+ if (!cmp && i < trans->updates + trans->nr_updates) { -+ EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); -+ -+ bch2_path_put(trans, i->path, true); -+ i->flags = n.flags; -+ i->cached = n.cached; -+ i->k = n.k; -+ i->path = n.path; -+ i->seq = n.seq; -+ i->ip_allocated = n.ip_allocated; -+ } else { -+ array_insert_item(trans->updates, trans->nr_updates, -+ i - trans->updates, n); -+ -+ i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; -+ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; -+ -+ if (unlikely(trans->journal_replay_not_finished)) { -+ struct bkey_i *j_k = -+ bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); -+ -+ if (j_k) { -+ i->old_k = j_k->k; -+ i->old_v = &j_k->v; -+ } -+ } -+ } -+ -+ __btree_path_get(i->path, true); -+ -+ /* -+ * If a key is present in the key cache, it must also exist in the -+ * btree - this is necessary for cache coherency. 
When iterating over -+ * a btree that's cached in the key cache, the btree iter code checks -+ * the key cache - but the key has to exist in the btree for that to -+ * work: -+ */ -+ if (path->cached && bkey_deleted(&i->old_k)) -+ return flush_new_cached_update(trans, path, i, flags, ip); -+ -+ return 0; -+} -+ -+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_i *k, enum btree_update_flags flags) -+{ -+ struct btree_path *path = iter->update_path ?: iter->path; -+ struct bkey_cached *ck; -+ int ret; -+ -+ if (iter->flags & BTREE_ITER_IS_EXTENTS) -+ return bch2_trans_update_extent(trans, iter, k, flags); -+ -+ if (bkey_deleted(&k->k) && -+ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && -+ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { -+ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); -+ if (unlikely(ret < 0)) -+ return ret; -+ -+ if (ret) -+ k->k.type = KEY_TYPE_whiteout; -+ } -+ -+ /* -+ * Ensure that updates to cached btrees go to the key cache: -+ */ -+ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && -+ !path->cached && -+ !path->level && -+ btree_id_cached(trans->c, path->btree_id)) { -+ if (!iter->key_cache_path || -+ !iter->key_cache_path->should_be_locked || -+ !bpos_eq(iter->key_cache_path->pos, k->k.p)) { -+ if (!iter->key_cache_path) -+ iter->key_cache_path = -+ bch2_path_get(trans, path->btree_id, path->pos, 1, 0, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_CACHED, _THIS_IP_); -+ -+ iter->key_cache_path = -+ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, -+ iter->flags & BTREE_ITER_INTENT, -+ _THIS_IP_); -+ -+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, -+ BTREE_ITER_CACHED); -+ if (unlikely(ret)) -+ return ret; -+ -+ ck = (void *) iter->key_cache_path->l[0].b; -+ -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); -+ } -+ -+ btree_path_set_should_be_locked(iter->key_cache_path); -+ } -+ -+ path = iter->key_cache_path; -+ } -+ -+ return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); -+} -+ -+/* -+ * Add a transaction update for a key that has already been journaled. 
-+ */ -+int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, -+ struct btree_iter *iter, struct bkey_i *k, -+ enum btree_update_flags flags) -+{ -+ trans->journal_res.seq = seq; -+ return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| -+ BTREE_UPDATE_PREJOURNAL); -+} -+ -+int __must_check bch2_trans_update_buffered(struct btree_trans *trans, -+ enum btree_id btree, -+ struct bkey_i *k) -+{ -+ struct btree_write_buffered_key *i; -+ int ret; -+ -+ EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); -+ EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); -+ -+ trans_for_each_wb_update(trans, i) { -+ if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { -+ bkey_copy(&i->k, k); -+ return 0; -+ } -+ } -+ -+ if (!trans->wb_updates || -+ trans->nr_wb_updates == trans->wb_updates_size) { -+ struct btree_write_buffered_key *u; -+ -+ if (trans->nr_wb_updates == trans->wb_updates_size) { -+ struct btree_transaction_stats *s = btree_trans_stats(trans); -+ -+ BUG_ON(trans->wb_updates_size > U8_MAX / 2); -+ trans->wb_updates_size = max(1, trans->wb_updates_size * 2); -+ if (s) -+ s->wb_updates_size = trans->wb_updates_size; -+ } -+ -+ u = bch2_trans_kmalloc_nomemzero(trans, -+ trans->wb_updates_size * -+ sizeof(struct btree_write_buffered_key)); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ return ret; -+ -+ if (trans->nr_wb_updates) -+ memcpy(u, trans->wb_updates, trans->nr_wb_updates * -+ sizeof(struct btree_write_buffered_key)); -+ trans->wb_updates = u; -+ } -+ -+ trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { -+ .btree = btree, -+ }; -+ -+ bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); -+ trans->nr_wb_updates++; -+ -+ return 0; -+} -+ -+int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, -+ enum btree_id btree, struct bpos end) -+{ -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); -+ k = bch2_btree_iter_prev(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ bch2_btree_iter_advance(iter); -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ BUG_ON(k.k->type != KEY_TYPE_deleted); -+ -+ if (bkey_gt(k.k->p, end)) { -+ ret = -BCH_ERR_ENOSPC_btree_slot; -+ goto err; -+ } -+ -+ return 0; -+err: -+ bch2_trans_iter_exit(trans, iter); -+ return ret; -+} -+ -+void bch2_trans_commit_hook(struct btree_trans *trans, -+ struct btree_trans_commit_hook *h) -+{ -+ h->next = trans->hooks; -+ trans->hooks = h; -+} -+ -+int bch2_btree_insert_nonextent(struct btree_trans *trans, -+ enum btree_id btree, struct bkey_i *k, -+ enum btree_update_flags flags) -+{ -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, btree, k->k.p, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_INTENT); -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(trans, &iter, k, flags); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, -+ struct bkey_i *k, enum btree_update_flags flags) -+{ -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), -+ BTREE_ITER_CACHED| -+ BTREE_ITER_INTENT); -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(trans, &iter, k, flags); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+/** -+ * bch2_btree_insert - insert keys into the extent btree -+ * @c: pointer to struct bch_fs -+ * @id: btree to insert into -+ * 
@insert_keys: list of keys to insert -+ * @hook: insert callback -+ */ -+int bch2_btree_insert(struct bch_fs *c, enum btree_id id, -+ struct bkey_i *k, -+ struct disk_reservation *disk_res, -+ u64 *journal_seq, int flags) -+{ -+ return bch2_trans_do(c, disk_res, journal_seq, flags, -+ __bch2_btree_insert(&trans, id, k, 0)); -+} -+ -+int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, -+ unsigned len, unsigned update_flags) -+{ -+ struct bkey_i *k; -+ -+ k = bch2_trans_kmalloc(trans, sizeof(*k)); -+ if (IS_ERR(k)) -+ return PTR_ERR(k); -+ -+ bkey_init(&k->k); -+ k->k.p = iter->pos; -+ bch2_key_resize(&k->k, len); -+ return bch2_trans_update(trans, iter, k, update_flags); -+} -+ -+int bch2_btree_delete_at(struct btree_trans *trans, -+ struct btree_iter *iter, unsigned update_flags) -+{ -+ return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); -+} -+ -+int bch2_btree_delete_at_buffered(struct btree_trans *trans, -+ enum btree_id btree, struct bpos pos) -+{ -+ struct bkey_i *k; -+ -+ k = bch2_trans_kmalloc(trans, sizeof(*k)); -+ if (IS_ERR(k)) -+ return PTR_ERR(k); -+ -+ bkey_init(&k->k); -+ k->k.p = pos; -+ return bch2_trans_update_buffered(trans, btree, k); -+} -+ -+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, -+ struct bpos start, struct bpos end, -+ unsigned update_flags, -+ u64 *journal_seq) -+{ -+ u32 restart_count = trans->restart_count; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); -+ while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(trans->c, 0); -+ struct bkey_i delete; -+ -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ bkey_init(&delete.k); -+ -+ /* -+ * This could probably be more efficient for extents: -+ */ -+ -+ /* -+ * For extents, iter.pos won't necessarily be the same as -+ * bkey_start_pos(k.k) (for non extents they always will be the -+ * same). It's important that we delete starting from iter.pos -+ * because the range we want to delete could start in the middle -+ * of k. -+ * -+ * (bch2_btree_iter_peek() does guarantee that iter.pos >= -+ * bkey_start_pos(k.k)). 
-+ */ -+ delete.k.p = iter.pos; -+ -+ if (iter.flags & BTREE_ITER_IS_EXTENTS) -+ bch2_key_resize(&delete.k, -+ bpos_min(end, k.k->p).offset - -+ iter.pos.offset); -+ -+ ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: -+ bch2_trans_commit(trans, &disk_res, journal_seq, -+ BTREE_INSERT_NOFAIL); -+ bch2_disk_reservation_put(trans->c, &disk_res); -+err: -+ /* -+ * the bch2_trans_begin() call is in a weird place because we -+ * need to call it after every transaction commit, to avoid path -+ * overflow, but don't want to call it if the delete operation -+ * is a no-op and we have no work to do: -+ */ -+ bch2_trans_begin(trans); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ ret = 0; -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (!ret && trans_was_restarted(trans, restart_count)) -+ ret = -BCH_ERR_transaction_restart_nested; -+ return ret; -+} -+ -+/* -+ * bch_btree_delete_range - delete everything within a given range -+ * -+ * Range is a half open interval - [start, end) -+ */ -+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, -+ struct bpos start, struct bpos end, -+ unsigned update_flags, -+ u64 *journal_seq) -+{ -+ int ret = bch2_trans_run(c, -+ bch2_btree_delete_range_trans(&trans, id, start, end, -+ update_flags, journal_seq)); -+ if (ret == -BCH_ERR_transaction_restart_nested) -+ ret = 0; -+ return ret; -+} -+ -+int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, -+ struct bpos pos, bool set) -+{ -+ struct bkey_i *k; -+ int ret = 0; -+ -+ k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); -+ ret = PTR_ERR_OR_ZERO(k); -+ if (unlikely(ret)) -+ return ret; -+ -+ bkey_init(&k->k); -+ k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; -+ k->k.p = pos; -+ -+ return bch2_trans_update_buffered(trans, btree, k); -+} -+ -+static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) -+{ -+ struct printbuf buf = PRINTBUF; -+ struct jset_entry_log *l; -+ unsigned u64s; -+ int ret; -+ -+ prt_vprintf(&buf, fmt, args); -+ ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; -+ if (ret) -+ goto err; -+ -+ u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); -+ -+ ret = darray_make_room(entries, jset_u64s(u64s)); -+ if (ret) -+ goto err; -+ -+ l = (void *) &darray_top(*entries); -+ l->entry.u64s = cpu_to_le16(u64s); -+ l->entry.btree_id = 0; -+ l->entry.level = 1; -+ l->entry.type = BCH_JSET_ENTRY_log; -+ l->entry.pad[0] = 0; -+ l->entry.pad[1] = 0; -+ l->entry.pad[2] = 0; -+ memcpy(l->d, buf.buf, buf.pos); -+ while (buf.pos & 7) -+ l->d[buf.pos++] = '\0'; -+ -+ entries->nr += jset_u64s(u64s); -+err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static int -+__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, -+ va_list args) -+{ -+ int ret; -+ -+ if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { -+ ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); -+ } else { -+ ret = bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_LAZY_RW|commit_flags, -+ __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); -+ } -+ -+ return ret; -+} -+ -+int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) -+{ -+ va_list args; -+ int ret; -+ -+ va_start(args, fmt); -+ ret = __bch2_fs_log_msg(c, 0, fmt, args); -+ va_end(args); -+ return ret; -+} -+ -+/* -+ * Use for logging messages during recovery to enable reserved space and avoid -+ * blocking. -+ */ -+int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
-+{ -+ va_list args; -+ int ret; -+ -+ va_start(args, fmt); -+ ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); -+ va_end(args); -+ return ret; -+} diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c new file mode 100644 -index 000000000..5f96db539 +index 000000000..6d2d43b6f --- /dev/null +++ b/fs/bcachefs/btree_write_buffer.c -@@ -0,0 +1,372 @@ +@@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -35242,7 +35550,8 @@ index 000000000..5f96db539 + } + return 0; +trans_commit: -+ return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, 0) ?: ++ return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, + commit_flags| + BTREE_INSERT_NOCHECK_RW| @@ -35291,7 +35600,8 @@ index 000000000..5f96db539 + BTREE_ITER_CACHED|BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, 0); ++ bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &iter); + return ret; +} @@ -35360,7 +35670,8 @@ index 000000000..5f96db539 + + if (!iter.path || iter.path->btree_id != i->btree) { + bch2_trans_iter_exit(trans, &iter); -+ bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT); ++ bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, ++ BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); + } + + bch2_btree_iter_set_pos(&iter, i->k.k.p); @@ -35609,10 +35920,10 @@ index 000000000..99993ba77 +#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 100644 -index 000000000..7bb7f0cae +index 000000000..c02c8c917 --- /dev/null +++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2106 @@ +@@ -0,0 +1,2107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. @@ -37539,6 +37850,7 @@ index 000000000..7bb7f0cae +int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) +{ + int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); ++ + if (ret) + bch_err_fn(c, ret); + return ret; @@ -37721,10 +38033,10 @@ index 000000000..7bb7f0cae +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h new file mode 100644 -index 000000000..a418f6648 +index 000000000..f192809f5 --- /dev/null +++ b/fs/bcachefs/buckets.h -@@ -0,0 +1,368 @@ +@@ -0,0 +1,413 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code for manipulating bucket marks for garbage collection. 
@@ -37737,7 +38049,31 @@ index 000000000..a418f6648 + +#include "buckets_types.h" +#include "extents.h" -+#include "super.h" ++#include "sb-members.h" ++ ++static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) ++{ ++ return div_u64(s, ca->mi.bucket_size); ++} ++ ++static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) ++{ ++ return ((sector_t) b) * ca->mi.bucket_size; ++} ++ ++static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) ++{ ++ u32 remainder; ++ ++ div_u64_rem(s, ca->mi.bucket_size, &remainder); ++ return remainder; ++} ++ ++static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, ++ u32 *offset) ++{ ++ return div_u64_rem(s, ca->mi.bucket_size, offset); ++} + +#define for_each_bucket(_b, _buckets) \ + for (_b = (_buckets)->b + (_buckets)->first_bucket; \ @@ -38019,6 +38355,27 @@ index 000000000..a418f6648 + size_t, enum bch_data_type, unsigned); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); + ++static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ u64 b_offset = bucket_to_sector(ca, b); ++ u64 b_end = bucket_to_sector(ca, b + 1); ++ unsigned i; ++ ++ if (!b) ++ return true; ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ u64 end = offset + (1 << layout->sb_max_size_bits); ++ ++ if (!(offset >= b_end || end <= b_offset)) ++ return true; ++ } ++ ++ return false; ++} ++ +/* disk reservations: */ + +static inline void bch2_disk_reservation_put(struct bch_fs *c, @@ -39190,7 +39547,7 @@ index 000000000..fb603df09 +#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h new file mode 100644 -index 000000000..3a4890d39 +index 000000000..0f563ca53 --- /dev/null +++ b/fs/bcachefs/chardev.h @@ -0,0 +1,31 @@ @@ -39213,7 +39570,7 @@ index 000000000..3a4890d39 +static inline long bch2_fs_ioctl(struct bch_fs *c, + unsigned cmd, void __user * arg) +{ -+ return -ENOSYS; ++ return -ENOTTY; +} + +static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} @@ -39227,10 +39584,10 @@ index 000000000..3a4890d39 +#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c new file mode 100644 -index 000000000..a08997a5b +index 000000000..36939020f --- /dev/null +++ b/fs/bcachefs/checksum.c -@@ -0,0 +1,709 @@ +@@ -0,0 +1,753 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" @@ -39498,9 +39855,10 @@ index 000000000..a08997a5b + +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { -+ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; ++ + bch2_checksum_update(&state, p, bv.bv_len); -+ kunmap_atomic(p); ++ kunmap_local(p); + } +#else + __bio_for_each_bvec(bv, bio, *iter, *iter) @@ -39520,10 +39878,10 @@ index 000000000..a08997a5b + +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { -+ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; + + crypto_shash_update(desc, p, bv.bv_len); -+ kunmap_atomic(p); ++ kunmap_local(p); + } +#else + __bio_for_each_bvec(bv, bio, *iter, *iter) @@ -39593,7 +39951,7 @@ index 000000000..a08997a5b + + state.type = type; + bch2_checksum_init(&state); -+ state.seed = a.lo; ++ state.seed = (u64 __force) a.lo; + + BUG_ON(!bch2_checksum_mergeable(type)); + @@ -39604,7 +39962,7 @@ 
index 000000000..a08997a5b + page_address(ZERO_PAGE(0)), b); + b_len -= b; + } -+ a.lo = bch2_checksum_final(&state); ++ a.lo = (__le64 __force) bch2_checksum_final(&state); + a.lo ^= b.lo; + a.hi ^= b.hi; + return a; @@ -39659,9 +40017,10 @@ index 000000000..a08997a5b + merged = bch2_checksum_bio(c, crc_old.csum_type, + extent_nonce(version, crc_old), bio); + -+ if (bch2_crc_cmp(merged, crc_old.csum)) { -+ bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n" ++ if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { ++ bch_err(c, "checksum error in %s() (memory corruption or bug?)\n" + "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", ++ __func__, + crc_old.csum.hi, + crc_old.csum.lo, + merged.hi, @@ -39691,6 +40050,48 @@ index 000000000..a08997a5b + return 0; +} + ++/* BCH_SB_FIELD_crypt: */ ++ ++static int bch2_sb_crypt_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { ++ prt_printf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&crypt->field), sizeof(*crypt)); ++ return -BCH_ERR_invalid_sb_crypt; ++ } ++ ++ if (BCH_CRYPT_KDF_TYPE(crypt)) { ++ prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ return -BCH_ERR_invalid_sb_crypt; ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ prt_newline(out); ++ prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); ++ prt_newline(out); ++ prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); ++ prt_newline(out); ++ prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); ++ prt_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_crypt = { ++ .validate = bch2_sb_crypt_validate, ++ .to_text = bch2_sb_crypt_to_text, ++}; ++ +#ifdef __KERNEL__ +static int __bch2_request_key(char *key_description, struct bch_key *key) +{ @@ -39830,7 +40231,7 @@ index 000000000..a08997a5b + if (ret) + goto out; + -+ crypt->key.magic = BCH_KEY_MAGIC; ++ crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC); + crypt->key.key = key; + + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); @@ -39858,7 +40259,7 @@ index 000000000..a08997a5b + if (ret) + goto err; + -+ key.magic = BCH_KEY_MAGIC; ++ key.magic = cpu_to_le64(BCH_KEY_MAGIC); + get_random_bytes(&key.key, sizeof(key.key)); + + if (keyed) { @@ -39942,10 +40343,10 @@ index 000000000..a08997a5b +} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h new file mode 100644 -index 000000000..1ad1d5f03 +index 000000000..c7b1a8fca --- /dev/null +++ b/fs/bcachefs/checksum.h -@@ -0,0 +1,209 @@ +@@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CHECKSUM_H +#define _BCACHEFS_CHECKSUM_H @@ -40020,6 +40421,8 @@ index 000000000..1ad1d5f03 + : 0; +} + ++extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; ++ +int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, + struct bch_key *); + @@ -40443,10 +40846,10 @@ index 000000000..5fae0012d +#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 -index 000000000..c9ca7cce5 +index 000000000..6b17f7cc5 --- /dev/null +++ b/fs/bcachefs/compress.c -@@ -0,0 +1,713 @@ +@@ -0,0 +1,714 @@ +// 
SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" @@ -41092,7 +41495,8 @@ index 000000000..c9ca7cce5 +static u64 compression_opt_to_feature(unsigned v) +{ + unsigned type = bch2_compression_decode(v).type; -+ return 1ULL << bch2_compression_opt_to_feature[type]; ++ ++ return BIT_ULL(bch2_compression_opt_to_feature[type]); +} + +int bch2_fs_compress_init(struct bch_fs *c) @@ -41359,7 +41763,7 @@ index 000000000..4778aa19b +#endif // _BCACHEFS_COUNTERS_H diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h new file mode 100644 -index 000000000..d4485fa01 +index 000000000..114f86b45 --- /dev/null +++ b/fs/bcachefs/darray.h @@ -0,0 +1,87 @@ @@ -41424,13 +41828,13 @@ index 000000000..d4485fa01 +#define darray_first(_d) ((_d).data[0]) +#define darray_last(_d) ((_d).data[(_d).nr - 1]) + -+#define darray_insert_item(_d, _pos, _item) \ ++#define darray_insert_item(_d, pos, _item) \ +({ \ -+ size_t pos = (_pos); \ ++ size_t _pos = (pos); \ + int _ret = darray_make_room((_d), 1); \ + \ + if (!_ret) \ -+ array_insert_item((_d)->data, (_d)->nr, pos, (_item)); \ ++ array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \ + _ret; \ +}) + @@ -41452,7 +41856,7 @@ index 000000000..d4485fa01 +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c new file mode 100644 -index 000000000..cfc624463 +index 000000000..81518f20d --- /dev/null +++ b/fs/bcachefs/data_update.c @@ -0,0 +1,562 @@ @@ -41873,7 +42277,7 @@ index 000000000..cfc624463 + break; + } + -+ if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { ++ if (closure_nr_remaining(&cl) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } @@ -43070,10 +43474,10 @@ index 000000000..2c37143b5 +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c new file mode 100644 -index 000000000..065ea59ee +index 000000000..a7559ab03 --- /dev/null +++ b/fs/bcachefs/dirent.c -@@ -0,0 +1,565 @@ +@@ -0,0 +1,590 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -43089,12 +43493,25 @@ index 000000000..065ea59ee + +#include + -+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) ++static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) +{ -+ unsigned len = bkey_val_bytes(d.k) - -+ offsetof(struct bch_dirent, d_name); ++ unsigned bkey_u64s = bkey_val_u64s(d.k); ++ unsigned bkey_bytes = bkey_u64s * sizeof(u64); ++ u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1]; ++#if CPU_BIG_ENDIAN ++ unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8; ++#else ++ unsigned trailing_nuls = last_u64 ? 
__builtin_clzll(last_u64) / 8 : 64 / 8; ++#endif + -+ return strnlen(d.v->d_name, len); ++ return bkey_bytes - ++ offsetof(struct bch_dirent, d_name) - ++ trailing_nuls; ++} ++ ++struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) ++{ ++ return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); +} + +static u64 bch2_dirent_hash(const struct bch_hash_info *info, @@ -43117,7 +43534,7 @@ index 000000000..065ea59ee +static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); ++ struct qstr name = bch2_dirent_get_name(d); + + return bch2_dirent_hash(info, &name); +} @@ -43125,20 +43542,20 @@ index 000000000..065ea59ee +static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) +{ + struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); -+ int len = bch2_dirent_name_bytes(l); -+ const struct qstr *r = _r; ++ const struct qstr l_name = bch2_dirent_get_name(l); ++ const struct qstr *r_name = _r; + -+ return len - r->len ?: memcmp(l.v->d_name, r->name, len); ++ return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len); +} + +static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) +{ + struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); + struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); -+ int l_len = bch2_dirent_name_bytes(l); -+ int r_len = bch2_dirent_name_bytes(r); ++ const struct qstr l_name = bch2_dirent_get_name(l); ++ const struct qstr r_name = bch2_dirent_get_name(r); + -+ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); ++ return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len); +} + +static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) @@ -43165,37 +43582,45 @@ index 000000000..065ea59ee + struct printbuf *err) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ unsigned len; ++ struct qstr d_name = bch2_dirent_get_name(d); + -+ len = bch2_dirent_name_bytes(d); -+ if (!len) { ++ if (!d_name.len) { + prt_printf(err, "empty name"); + return -BCH_ERR_invalid_bkey; + } + -+ if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { ++ if (bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len)) { + prt_printf(err, "value too big (%zu > %u)", -+ bkey_val_u64s(k.k), dirent_val_u64s(len)); ++ bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); + return -BCH_ERR_invalid_bkey; + } + -+ if (len > BCH_NAME_MAX) { ++ /* ++ * Check new keys don't exceed the max length ++ * (older keys may be larger.) 
++ */ ++ if ((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX) { + prt_printf(err, "dirent name too big (%u > %u)", -+ len, BCH_NAME_MAX); ++ d_name.len, BCH_NAME_MAX); + return -BCH_ERR_invalid_bkey; + } + -+ if (len == 1 && !memcmp(d.v->d_name, ".", 1)) { ++ if (d_name.len != strnlen(d_name.name, d_name.len)) { ++ prt_printf(err, "dirent has stray data after name's NUL"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ if (d_name.len == 1 && !memcmp(d_name.name, ".", 1)) { + prt_printf(err, "invalid name"); + return -BCH_ERR_invalid_bkey; + } + -+ if (len == 2 && !memcmp(d.v->d_name, "..", 2)) { ++ if (d_name.len == 2 && !memcmp(d_name.name, "..", 2)) { + prt_printf(err, "invalid name"); + return -BCH_ERR_invalid_bkey; + } + -+ if (memchr(d.v->d_name, '/', len)) { ++ if (memchr(d_name.name, '/', d_name.len)) { + prt_printf(err, "invalid name"); + return -BCH_ERR_invalid_bkey; + } @@ -43213,10 +43638,11 @@ index 000000000..065ea59ee + struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ struct qstr d_name = bch2_dirent_get_name(d); + + prt_printf(out, "%.*s -> %llu type %s", -+ bch2_dirent_name_bytes(d), -+ d.v->d_name, ++ d_name.len, ++ d_name.name, + d.v->d_type != DT_SUBVOL + ? le64_to_cpu(d.v->d_inum) + : le32_to_cpu(d.v->d_child_subvol), @@ -43583,6 +44009,7 @@ index 000000000..065ea59ee + subvol_inum target; + u32 snapshot; + struct bkey_buf sk; ++ struct qstr name; + int ret; + + bch2_bkey_buf_init(&sk); @@ -43613,9 +44040,11 @@ index 000000000..065ea59ee + dirent = bkey_i_to_s_c_dirent(sk.k); + bch2_trans_unlock(&trans); + ++ name = bch2_dirent_get_name(dirent); ++ + ctx->pos = dirent.k->p.offset; -+ if (!dir_emit(ctx, dirent.v->d_name, -+ bch2_dirent_name_bytes(dirent), ++ if (!dir_emit(ctx, name.name, ++ name.len, + target.inum, + vfs_d_type(dirent.v->d_type))) + break; @@ -43641,7 +44070,7 @@ index 000000000..065ea59ee +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h new file mode 100644 -index 000000000..b42f4a13b +index 000000000..e9fa1df38 --- /dev/null +++ b/fs/bcachefs/dirent.h @@ -0,0 +1,70 @@ @@ -43671,7 +44100,7 @@ index 000000000..b42f4a13b +struct bch_hash_info; +struct bch_inode_info; + -+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); ++struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); + +static inline unsigned dirent_val_u64s(unsigned len) +{ @@ -43717,13 +44146,14 @@ index 000000000..b42f4a13b +#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c new file mode 100644 -index 000000000..de14ca3a9 +index 000000000..f36472c4a --- /dev/null +++ b/fs/bcachefs/disk_groups.c -@@ -0,0 +1,555 @@ +@@ -0,0 +1,556 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "disk_groups.h" ++#include "sb-members.h" +#include "super-io.h" + +#include @@ -44390,10 +44820,10 @@ index 000000000..bd7711767 +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 -index 000000000..efbb7cf7a +index 000000000..f58e84a2b --- /dev/null +++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1960 @@ +@@ -0,0 +1,1972 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ @@ -44596,11 +45026,14 @@ index 000000000..efbb7cf7a + +static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) +{ -+ unsigned i; ++ if (buf->key.k.type == KEY_TYPE_stripe) { ++ struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key); ++ unsigned i; + -+ for (i = 0; i < buf->key.v.nr_blocks; i++) { -+ kvpfree(buf->data[i], buf->size << 9); -+ buf->data[i] = NULL; ++ 
for (i = 0; i < s->v.nr_blocks; i++) { ++ kvpfree(buf->data[i], buf->size << 9); ++ buf->data[i] = NULL; ++ } + } +} + @@ -44608,7 +45041,7 @@ index 000000000..efbb7cf7a +static int ec_stripe_buf_init(struct ec_stripe_buf *buf, + unsigned offset, unsigned size) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned csum_granularity = 1U << v->csum_granularity_bits; + unsigned end = offset + size; + unsigned i; @@ -44624,7 +45057,7 @@ index 000000000..efbb7cf7a + + memset(buf->valid, 0xFF, sizeof(buf->valid)); + -+ for (i = 0; i < buf->key.v.nr_blocks; i++) { ++ for (i = 0; i < v->nr_blocks; i++) { + buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); + if (!buf->data[i]) + goto err; @@ -44641,7 +45074,7 @@ index 000000000..efbb7cf7a +static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, + unsigned block, unsigned offset) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned end = buf->offset + buf->size; + unsigned len = min(csum_granularity, end - offset); @@ -44660,7 +45093,7 @@ index 000000000..efbb7cf7a + +static void ec_generate_checksums(struct ec_stripe_buf *buf) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned i, j, csums_per_device = stripe_csums_per_device(v); + + if (!v->csum_type) @@ -44677,7 +45110,7 @@ index 000000000..efbb7cf7a + +static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned i; + @@ -44700,7 +45133,7 @@ index 000000000..efbb7cf7a + if (bch2_crc_cmp(want, got)) { + struct printbuf buf2 = PRINTBUF; + -+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i)); ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key)); + + bch_err_ratelimited(c, + "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", @@ -44720,7 +45153,7 @@ index 000000000..efbb7cf7a + +static void ec_generate_ec(struct ec_stripe_buf *buf) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = le16_to_cpu(v->sectors) << 9; + @@ -44729,13 +45162,14 @@ index 000000000..efbb7cf7a + +static unsigned ec_nr_failed(struct ec_stripe_buf *buf) +{ -+ return buf->key.v.nr_blocks - -+ bitmap_weight(buf->valid, buf->key.v.nr_blocks); ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; ++ ++ return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks); +} + +static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = buf->size << 9; @@ -44759,7 +45193,7 @@ index 000000000..efbb7cf7a +static void ec_block_endio(struct bio *bio) +{ + struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); -+ struct bch_stripe *v = &ec_bio->buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v; + struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; + struct bch_dev *ca = ec_bio->ca; + struct closure *cl = bio->bi_private; @@ -44784,11 +45218,11 @@ index 
000000000..efbb7cf7a +static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, + blk_opf_t opf, unsigned idx, struct closure *cl) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned offset = 0, bytes = buf->size << 9; + struct bch_extent_ptr *ptr = &v->ptrs[idx]; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant ++ enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant + ? BCH_DATA_user + : BCH_DATA_parity; + int rw = op_is_write(opf); @@ -44859,7 +45293,7 @@ index 000000000..efbb7cf7a + ret = -ENOENT; + goto err; + } -+ bkey_reassemble(&stripe->key.k_i, k); ++ bkey_reassemble(&stripe->key, k); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; @@ -44895,7 +45329,7 @@ index 000000000..efbb7cf7a + return -EIO; + } + -+ v = &buf->key.v; ++ v = &bkey_i_to_stripe(&buf->key)->v; + + if (!bch2_ptr_matches_stripe(v, rbio->pick)) { + bch_err_ratelimited(c, @@ -45271,6 +45705,7 @@ index 000000000..efbb7cf7a + struct ec_stripe_buf *s, + struct bpos *bp_pos) +{ ++ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; + struct bch_fs *c = trans->c; + struct bch_backpointer bp; + struct btree_iter iter; @@ -45322,7 +45757,7 @@ index 000000000..efbb7cf7a + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) + goto out; + -+ ptr_c = bkey_matches_stripe(&s->key.v, k, &block); ++ ptr_c = bkey_matches_stripe(v, k, &block); + /* + * It doesn't generally make sense to erasure code cached ptrs: + * XXX: should we be incrementing a counter? @@ -45330,7 +45765,7 @@ index 000000000..efbb7cf7a + if (!ptr_c || ptr_c->cached) + goto out; + -+ dev = s->key.v.ptrs[block].dev; ++ dev = v->ptrs[block].dev; + + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); + ret = PTR_ERR_OR_ZERO(n); @@ -45346,7 +45781,7 @@ index 000000000..efbb7cf7a + stripe_ptr = (struct bch_extent_stripe_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, + .block = block, -+ .redundancy = s->key.v.nr_redundant, ++ .redundancy = v->nr_redundant, + .idx = s->key.k.p.offset, + }; + @@ -45364,7 +45799,8 @@ index 000000000..efbb7cf7a + unsigned block) +{ + struct bch_fs *c = trans->c; -+ struct bch_extent_ptr bucket = s->key.v.ptrs[block]; ++ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; ++ struct bch_extent_ptr bucket = v->ptrs[block]; + struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); + struct bpos bp_pos = POS_MIN; + int ret = 0; @@ -45389,7 +45825,7 @@ index 000000000..efbb7cf7a +static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) +{ + struct btree_trans trans; -+ struct bch_stripe *v = &s->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + int ret = 0; + @@ -45453,7 +45889,7 @@ index 000000000..efbb7cf7a +{ + struct bch_fs *c = s->c; + struct open_bucket *ob; -+ struct bch_stripe *v = &s->new_stripe.key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + int ret; + @@ -45486,7 +45922,7 @@ index 000000000..efbb7cf7a + } + + for (i = 0; i < nr_data; i++) -+ if (stripe_blockcount_get(&s->existing_stripe.key.v, i)) ++ if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i)) + swap(s->new_stripe.data[i], + s->existing_stripe.data[i]); + @@ -45513,8 +45949,9 @@ index 000000000..efbb7cf7a + ret = bch2_trans_do(c, &s->res, NULL, + 
BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL, -+ ec_stripe_key_update(&trans, &s->new_stripe.key, -+ !s->have_existing_stripe)); ++ ec_stripe_key_update(&trans, ++ bkey_i_to_stripe(&s->new_stripe.key), ++ !s->have_existing_stripe)); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err; @@ -45675,14 +46112,14 @@ index 000000000..efbb7cf7a +} + +static void ec_stripe_key_init(struct bch_fs *c, -+ struct bkey_i_stripe *s, ++ struct bkey_i *k, + unsigned nr_data, + unsigned nr_parity, + unsigned stripe_size) +{ ++ struct bkey_i_stripe *s = bkey_stripe_init(k); + unsigned u64s; + -+ bkey_stripe_init(&s->k_i); + s->v.sectors = cpu_to_le16(stripe_size); + s->v.algorithm = 0; + s->v.nr_blocks = nr_data + nr_parity; @@ -45721,8 +46158,8 @@ index 000000000..efbb7cf7a + BCH_BKEY_PTRS_MAX) - h->redundancy; + s->nr_parity = h->redundancy; + -+ ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, -+ s->nr_parity, h->blocksize); ++ ec_stripe_key_init(c, &s->new_stripe.key, ++ s->nr_data, s->nr_parity, h->blocksize); + + h->s = s; + return 0; @@ -45825,15 +46262,16 @@ index 000000000..efbb7cf7a + struct bch_devs_mask devs = h->devs; + struct open_bucket *ob; + struct open_buckets buckets; ++ struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; + unsigned i, j, nr_have_parity = 0, nr_have_data = 0; + bool have_cache = true; + int ret = 0; + -+ BUG_ON(h->s->new_stripe.key.v.nr_blocks != h->s->nr_data + h->s->nr_parity); -+ BUG_ON(h->s->new_stripe.key.v.nr_redundant != h->s->nr_parity); ++ BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); ++ BUG_ON(v->nr_redundant != h->s->nr_parity); + -+ for_each_set_bit(i, h->s->blocks_gotten, h->s->new_stripe.key.v.nr_blocks) { -+ __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); ++ for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { ++ __clear_bit(v->ptrs[i].dev, devs.d); + if (i < h->s->nr_data) + nr_have_data++; + else @@ -45862,7 +46300,7 @@ index 000000000..efbb7cf7a + BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + + h->s->blocks[j] = buckets.v[i]; -+ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); ++ v->ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + @@ -45888,7 +46326,7 @@ index 000000000..efbb7cf7a + BUG_ON(j >= h->s->nr_data); + + h->s->blocks[j] = buckets.v[i]; -+ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); ++ v->ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + @@ -45938,6 +46376,8 @@ index 000000000..efbb7cf7a +static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) +{ + struct bch_fs *c = trans->c; ++ struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; ++ struct bch_stripe *existing_v; + unsigned i; + s64 idx; + int ret; @@ -45958,9 +46398,11 @@ index 000000000..efbb7cf7a + return ret; + } + -+ BUG_ON(h->s->existing_stripe.key.v.nr_redundant != h->s->nr_parity); -+ h->s->nr_data = h->s->existing_stripe.key.v.nr_blocks - -+ h->s->existing_stripe.key.v.nr_redundant; ++ existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; ++ ++ BUG_ON(existing_v->nr_redundant != h->s->nr_parity); ++ h->s->nr_data = existing_v->nr_blocks - ++ existing_v->nr_redundant; + + ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); + if (ret) { @@ -45969,21 +46411,21 @@ index 000000000..efbb7cf7a + } + + BUG_ON(h->s->existing_stripe.size != h->blocksize); -+ BUG_ON(h->s->existing_stripe.size != le16_to_cpu(h->s->existing_stripe.key.v.sectors)); ++ 
BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); + + /* + * Free buckets we initially allocated - they might conflict with + * blocks from the stripe we're reusing: + */ -+ for_each_set_bit(i, h->s->blocks_gotten, h->s->new_stripe.key.v.nr_blocks) { ++ for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { + bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); + h->s->blocks[i] = 0; + } + memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); + memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); + -+ for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { -+ if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { ++ for (i = 0; i < existing_v->nr_blocks; i++) { ++ if (stripe_blockcount_get(existing_v, i)) { + __set_bit(i, h->s->blocks_gotten); + __set_bit(i, h->s->blocks_allocated); + } @@ -45991,7 +46433,7 @@ index 000000000..efbb7cf7a + ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); + } + -+ bkey_copy(&h->s->new_stripe.key.k_i, &h->s->existing_stripe.key.k_i); ++ bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); + h->s->have_existing_stripe = true; + + return 0; @@ -46160,7 +46602,7 @@ index 000000000..efbb7cf7a + if (!ca) + goto found; + -+ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { ++ for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { + if (!h->s->blocks[i]) + continue; + @@ -46318,7 +46760,7 @@ index 000000000..efbb7cf7a + break; + + if (h->s) { -+ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) ++ for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) + BUG_ON(h->s->blocks[i]); + + kfree(h->s); @@ -46356,10 +46798,10 @@ index 000000000..efbb7cf7a +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 -index 000000000..1b1848e5f +index 000000000..885ae5d51 --- /dev/null +++ b/fs/bcachefs/ec.h -@@ -0,0 +1,263 @@ +@@ -0,0 +1,260 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_H +#define _BCACHEFS_EC_H @@ -46500,10 +46942,7 @@ index 000000000..1b1848e5f + + void *data[BCH_BKEY_PTRS_MAX]; + -+ union { -+ struct bkey_i_stripe key; -+ u64 pad[255]; -+ }; ++ __BKEY_PADDED(key, 255); +}; + +struct ec_stripe_head; @@ -46741,10 +47180,10 @@ index 000000000..dc906fc91 +} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h new file mode 100644 -index 000000000..735eb2416 +index 000000000..f7fa87442 --- /dev/null +++ b/fs/bcachefs/errcode.h -@@ -0,0 +1,246 @@ +@@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERRCODE_H +#define _BCACHEFS_ERRCODE_H @@ -46960,6 +47399,12 @@ index 000000000..735eb2416 + x(BCH_ERR_invalid_sb, invalid_sb_quota) \ + x(BCH_ERR_invalid, invalid_bkey) \ + x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ ++ x(EIO, btree_node_read_err) \ ++ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ ++ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ ++ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ ++ x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \ ++ x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) + +enum bch_errcode { + BCH_ERR_START = 2048, @@ -47702,10 +48147,10 @@ index 000000000..6f5cf4493 +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 -index 000000000..c13e0afc6 +index 000000000..1b25f84e4 --- /dev/null +++ b/fs/bcachefs/extents.c -@@ -0,0 +1,1394 @@ +@@ -0,0 +1,1403 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -48225,13 +48670,13 @@ index 000000000..c13e0afc6 + switch (type) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); -+ memcpy(&dst->crc32.csum, &src.csum.lo, sizeof(dst->crc32.csum)); ++ dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; -+ dst->crc64.csum_lo = src.csum.lo; -+ dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); ++ dst->crc64.csum_lo = (u64 __force) src.csum.lo; ++ dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); + break; + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); @@ -48623,11 +49068,11 @@ index 000000000..c13e0afc6 + + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) -+ if (p1.ptr.dev == p2.ptr.dev && -+ p1.ptr.gen == p2.ptr.gen && -+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == -+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) -+ return true; ++ if (p1.ptr.dev == p2.ptr.dev && ++ p1.ptr.gen == p2.ptr.gen && ++ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == ++ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) ++ return true; + + return false; + } else { @@ -48767,6 +49212,7 @@ index 000000000..c13e0afc6 + +static int extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k, ++ enum bkey_invalid_flags flags, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata, @@ -48779,6 +49225,14 @@ index 000000000..c13e0afc6 + struct bch_dev *ca; + + if (!bch2_dev_exists2(c, ptr->dev)) { ++ /* ++ * If we're in the write path this key might have already been ++ * overwritten, and we could be seeing a device that doesn't ++ * exist anymore due to racing with device removal: ++ */ ++ if (flags & BKEY_INVALID_WRITE) ++ return 0; ++ + prt_printf(err, "pointer to invalid device (%u)", ptr->dev); + return -BCH_ERR_invalid_bkey; + } @@ -48844,8 +49298,8 @@ index 000000000..c13e0afc6 + + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: -+ ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk, -+ false, err); ++ ret = extent_ptr_invalid(c, k, flags, &entry->ptr, ++ size_ondisk, false, err); + if (ret) + return ret; + @@ -49102,7 +49556,7 @@ index 000000000..c13e0afc6 +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h new file mode 100644 -index 000000000..d359b3fda +index 000000000..7ee8d031b --- /dev/null +++ b/fs/bcachefs/extents.h @@ -0,0 +1,757 @@ @@ -49263,7 +49717,7 @@ index 000000000..d359b3fda + common_fields(crc->crc32), + }; + -+ memcpy(&ret.csum.lo, &crc->crc32.csum, sizeof(crc->crc32.csum)); ++ *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum; + return ret; + } + case BCH_EXTENT_ENTRY_crc64: { @@ -49273,8 +49727,8 @@ index 000000000..d359b3fda + .csum.lo = (__force __le64) crc->crc64.csum_lo, + }; + -+ u16 hi = crc->crc64.csum_hi; -+ memcpy(&ret.csum.hi, &hi, sizeof(hi)); ++ *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi; ++ + return ret; + } + case BCH_EXTENT_ENTRY_crc128: { @@ -49796,7 +50250,7 @@ index 000000000..d359b3fda +void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, -+ unsigned, struct printbuf *); ++ enum bkey_invalid_flags, struct printbuf *); + +void bch2_ptr_swab(struct bkey_s); + @@ -50885,221 +51339,28 @@ index 
000000000..dde237859 + struct bch_inode_unpacked *); + +#endif /* _BCACHEFS_FS_COMMON_H */ -diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c new file mode 100644 -index 000000000..6b691b2b5 +index 000000000..dc22182d5 --- /dev/null -+++ b/fs/bcachefs/fs-io.c -@@ -0,0 +1,3982 @@ ++++ b/fs/bcachefs/fs-io-buffered.c +@@ -0,0 +1,1099 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "clock.h" -+#include "error.h" -+#include "extents.h" -+#include "extent_update.h" -+#include "fs.h" +#include "fs-io.h" -+#include "fsck.h" -+#include "inode.h" -+#include "journal.h" ++#include "fs-io-buffered.h" ++#include "fs-io-direct.h" ++#include "fs-io-pagecache.h" +#include "io.h" -+#include "keylist.h" -+#include "quota.h" -+#include "reflink.h" -+#include "trace.h" + -+#include +#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include ++#include +#include + -+#include -+ -+static void bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned); -+ -+struct folio_vec { -+ struct folio *fv_folio; -+ size_t fv_offset; -+ size_t fv_len; -+}; -+ -+static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) -+{ -+ -+ struct folio *folio = page_folio(bv.bv_page); -+ size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + -+ bv.bv_offset; -+ size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); -+ -+ return (struct folio_vec) { -+ .fv_folio = folio, -+ .fv_offset = offset, -+ .fv_len = len, -+ }; -+} -+ -+static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, -+ struct bvec_iter iter) -+{ -+ return biovec_to_foliovec(bio_iter_iovec(bio, iter)); -+} -+ -+#define __bio_for_each_folio(bvl, bio, iter, start) \ -+ for (iter = (start); \ -+ (iter).bi_size && \ -+ ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ -+ bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) -+ -+/** -+ * bio_for_each_folio - iterate over folios within a bio -+ * -+ * Like other non-_all versions, this iterates over what bio->bi_iter currently -+ * points to. This version is for drivers, where the bio may have previously -+ * been split or cloned. -+ */ -+#define bio_for_each_folio(bvl, bio, iter) \ -+ __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) -+ -+/* -+ * Use u64 for the end pos and sector helpers because if the folio covers the -+ * max supported range of the mapping, the start offset of the next folio -+ * overflows loff_t. This breaks much of the range based processing in the -+ * buffered write path. 
-+ */ -+static inline u64 folio_end_pos(struct folio *folio) -+{ -+ return folio_pos(folio) + folio_size(folio); -+} -+ -+static inline size_t folio_sectors(struct folio *folio) -+{ -+ return PAGE_SECTORS << folio_order(folio); -+} -+ -+static inline loff_t folio_sector(struct folio *folio) -+{ -+ return folio_pos(folio) >> 9; -+} -+ -+static inline u64 folio_end_sector(struct folio *folio) -+{ -+ return folio_end_pos(folio) >> 9; -+} -+ -+typedef DARRAY(struct folio *) folios; -+ -+static int filemap_get_contig_folios_d(struct address_space *mapping, -+ loff_t start, u64 end, -+ int fgp_flags, gfp_t gfp, -+ folios *folios) -+{ -+ struct folio *f; -+ u64 pos = start; -+ int ret = 0; -+ -+ while (pos < end) { -+ if ((u64) pos >= (u64) start + (1ULL << 20)) -+ fgp_flags &= ~FGP_CREAT; -+ -+ ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); -+ if (ret) -+ break; -+ -+ f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); -+ if (IS_ERR_OR_NULL(f)) -+ break; -+ -+ BUG_ON(folios->nr && folio_pos(f) != pos); -+ -+ pos = folio_end_pos(f); -+ darray_push(folios, f); -+ } -+ -+ if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) -+ ret = -ENOMEM; -+ -+ return folios->nr ? 0 : ret; -+} -+ -+struct nocow_flush { -+ struct closure *cl; -+ struct bch_dev *ca; -+ struct bio bio; -+}; -+ -+static void nocow_flush_endio(struct bio *_bio) -+{ -+ -+ struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); -+ -+ closure_put(bio->cl); -+ percpu_ref_put(&bio->ca->io_ref); -+ bio_put(&bio->bio); -+} -+ -+static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct closure *cl) -+{ -+ struct nocow_flush *bio; -+ struct bch_dev *ca; -+ struct bch_devs_mask devs; -+ unsigned dev; -+ -+ dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); -+ if (dev == BCH_SB_MEMBERS_MAX) -+ return; -+ -+ devs = inode->ei_devs_need_flush; -+ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); -+ -+ for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { -+ rcu_read_lock(); -+ ca = rcu_dereference(c->devs[dev]); -+ if (ca && !percpu_ref_tryget(&ca->io_ref)) -+ ca = NULL; -+ rcu_read_unlock(); -+ -+ if (!ca) -+ continue; -+ -+ bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, -+ REQ_OP_FLUSH, -+ GFP_KERNEL, -+ &c->nocow_flush_bioset), -+ struct nocow_flush, bio); -+ bio->cl = cl; -+ bio->ca = ca; -+ bio->bio.bi_end_io = nocow_flush_endio; -+ closure_bio_submit(&bio->bio, cl); -+ } -+} -+ -+static int bch2_inode_flush_nocow_writes(struct bch_fs *c, -+ struct bch_inode_info *inode) -+{ -+ struct closure cl; -+ -+ closure_init_stack(&cl); -+ bch2_inode_flush_nocow_writes_async(c, inode, &cl); -+ closure_sync(&cl); -+ -+ return 0; -+} -+ +static inline bool bio_full(struct bio *bio, unsigned len) +{ + if (bio->bi_vcnt >= bio->bi_max_vecs) @@ -51109,893 +51370,6 @@ index 000000000..6b691b2b5 + return false; +} + -+static inline struct address_space *faults_disabled_mapping(void) -+{ -+ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); -+} -+ -+static inline void set_fdm_dropped_locks(void) -+{ -+ current->faults_disabled_mapping = -+ (void *) (((unsigned long) current->faults_disabled_mapping)|1); -+} -+ -+static inline bool fdm_dropped_locks(void) -+{ -+ return ((unsigned long) current->faults_disabled_mapping) & 1; -+} -+ -+struct quota_res { -+ u64 sectors; -+}; -+ -+struct bch_writepage_io { -+ struct bch_inode_info *inode; -+ -+ /* must be last: */ -+ struct bch_write_op op; -+}; -+ 
-+struct dio_write { -+ struct kiocb *req; -+ struct address_space *mapping; -+ struct bch_inode_info *inode; -+ struct mm_struct *mm; -+ unsigned loop:1, -+ extending:1, -+ sync:1, -+ flush:1, -+ free_iov:1; -+ struct quota_res quota_res; -+ u64 written; -+ -+ struct iov_iter iter; -+ struct iovec inline_vecs[2]; -+ -+ /* must be last: */ -+ struct bch_write_op op; -+}; -+ -+struct dio_read { -+ struct closure cl; -+ struct kiocb *req; -+ long ret; -+ bool should_dirty; -+ struct bch_read_bio rbio; -+}; -+ -+/* pagecache_block must be held */ -+static noinline int write_invalidate_inode_pages_range(struct address_space *mapping, -+ loff_t start, loff_t end) -+{ -+ int ret; -+ -+ /* -+ * XXX: the way this is currently implemented, we can spin if a process -+ * is continually redirtying a specific page -+ */ -+ do { -+ if (!mapping->nrpages) -+ return 0; -+ -+ ret = filemap_write_and_wait_range(mapping, start, end); -+ if (ret) -+ break; -+ -+ if (!mapping->nrpages) -+ return 0; -+ -+ ret = invalidate_inode_pages2_range(mapping, -+ start >> PAGE_SHIFT, -+ end >> PAGE_SHIFT); -+ } while (ret == -EBUSY); -+ -+ return ret; -+} -+ -+/* quotas */ -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+static void __bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) -+{ -+ BUG_ON(res->sectors > inode->ei_quota_reserved); -+ -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -+ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); -+ inode->ei_quota_reserved -= res->sectors; -+ res->sectors = 0; -+} -+ -+static void bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) -+{ -+ if (res->sectors) { -+ mutex_lock(&inode->ei_quota_lock); -+ __bch2_quota_reservation_put(c, inode, res); -+ mutex_unlock(&inode->ei_quota_lock); -+ } -+} -+ -+static int bch2_quota_reservation_add(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res, -+ u64 sectors, -+ bool check_enospc) -+{ -+ int ret; -+ -+ if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) -+ return 0; -+ -+ mutex_lock(&inode->ei_quota_lock); -+ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, -+ check_enospc ? 
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); -+ if (likely(!ret)) { -+ inode->ei_quota_reserved += sectors; -+ res->sectors += sectors; -+ } -+ mutex_unlock(&inode->ei_quota_lock); -+ -+ return ret; -+} -+ -+#else -+ -+static void __bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) {} -+ -+static void bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) {} -+ -+static int bch2_quota_reservation_add(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res, -+ unsigned sectors, -+ bool check_enospc) -+{ -+ return 0; -+} -+ -+#endif -+ -+/* i_size updates: */ -+ -+struct inode_new_size { -+ loff_t new_size; -+ u64 now; -+ unsigned fields; -+}; -+ -+static int inode_set_size(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct inode_new_size *s = p; -+ -+ bi->bi_size = s->new_size; -+ if (s->fields & ATTR_ATIME) -+ bi->bi_atime = s->now; -+ if (s->fields & ATTR_MTIME) -+ bi->bi_mtime = s->now; -+ if (s->fields & ATTR_CTIME) -+ bi->bi_ctime = s->now; -+ -+ return 0; -+} -+ -+int __must_check bch2_write_inode_size(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ loff_t new_size, unsigned fields) -+{ -+ struct inode_new_size s = { -+ .new_size = new_size, -+ .now = bch2_current_time(c), -+ .fields = fields, -+ }; -+ -+ return bch2_write_inode(c, inode, inode_set_size, &s, fields); -+} -+ -+static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, -+ struct quota_res *quota_res, s64 sectors) -+{ -+ bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, -+ "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", -+ inode->v.i_ino, (u64) inode->v.i_blocks, sectors, -+ inode->ei_inode.bi_sectors); -+ inode->v.i_blocks += sectors; -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ if (quota_res && -+ !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && -+ sectors > 0) { -+ BUG_ON(sectors > quota_res->sectors); -+ BUG_ON(sectors > inode->ei_quota_reserved); -+ -+ quota_res->sectors -= sectors; -+ inode->ei_quota_reserved -= sectors; -+ } else { -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); -+ } -+#endif -+} -+ -+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, -+ struct quota_res *quota_res, s64 sectors) -+{ -+ if (sectors) { -+ mutex_lock(&inode->ei_quota_lock); -+ __i_sectors_acct(c, inode, quota_res, sectors); -+ mutex_unlock(&inode->ei_quota_lock); -+ } -+} -+ -+/* page state: */ -+ -+/* stored in page->private: */ -+ -+#define BCH_FOLIO_SECTOR_STATE() \ -+ x(unallocated) \ -+ x(reserved) \ -+ x(dirty) \ -+ x(dirty_reserved) \ -+ x(allocated) -+ -+enum bch_folio_sector_state { -+#define x(n) SECTOR_##n, -+ BCH_FOLIO_SECTOR_STATE() -+#undef x -+}; -+ -+static const char * const bch2_folio_sector_states[] = { -+#define x(n) #n, -+ BCH_FOLIO_SECTOR_STATE() -+#undef x -+ NULL -+}; -+ -+static inline enum bch_folio_sector_state -+folio_sector_dirty(enum bch_folio_sector_state state) -+{ -+ switch (state) { -+ case SECTOR_unallocated: -+ return SECTOR_dirty; -+ case SECTOR_reserved: -+ return SECTOR_dirty_reserved; -+ default: -+ return state; -+ } -+} -+ -+static inline enum bch_folio_sector_state -+folio_sector_undirty(enum bch_folio_sector_state state) -+{ -+ switch (state) { -+ case SECTOR_dirty: -+ return SECTOR_unallocated; -+ case SECTOR_dirty_reserved: -+ return SECTOR_reserved; -+ default: -+ return state; -+ } -+} -+ -+static inline enum 
bch_folio_sector_state -+folio_sector_reserve(enum bch_folio_sector_state state) -+{ -+ switch (state) { -+ case SECTOR_unallocated: -+ return SECTOR_reserved; -+ case SECTOR_dirty: -+ return SECTOR_dirty_reserved; -+ default: -+ return state; -+ } -+} -+ -+struct bch_folio_sector { -+ /* Uncompressed, fully allocated replicas (or on disk reservation): */ -+ unsigned nr_replicas:4; -+ -+ /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ -+ unsigned replicas_reserved:4; -+ -+ /* i_sectors: */ -+ enum bch_folio_sector_state state:8; -+}; -+ -+struct bch_folio { -+ spinlock_t lock; -+ atomic_t write_count; -+ /* -+ * Is the sector state up to date with the btree? -+ * (Not the data itself) -+ */ -+ bool uptodate; -+ struct bch_folio_sector s[]; -+}; -+ -+static inline void folio_sector_set(struct folio *folio, -+ struct bch_folio *s, -+ unsigned i, unsigned n) -+{ -+ s->s[i].state = n; -+} -+ -+/* file offset (to folio offset) to bch_folio_sector index */ -+static inline int folio_pos_to_s(struct folio *folio, loff_t pos) -+{ -+ u64 f_offset = pos - folio_pos(folio); -+ BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); -+ return f_offset >> SECTOR_SHIFT; -+} -+ -+static inline struct bch_folio *__bch2_folio(struct folio *folio) -+{ -+ return folio_has_private(folio) -+ ? (struct bch_folio *) folio_get_private(folio) -+ : NULL; -+} -+ -+static inline struct bch_folio *bch2_folio(struct folio *folio) -+{ -+ EBUG_ON(!folio_test_locked(folio)); -+ -+ return __bch2_folio(folio); -+} -+ -+/* for newly allocated folios: */ -+static void __bch2_folio_release(struct folio *folio) -+{ -+ kfree(folio_detach_private(folio)); -+} -+ -+static void bch2_folio_release(struct folio *folio) -+{ -+ EBUG_ON(!folio_test_locked(folio)); -+ __bch2_folio_release(folio); -+} -+ -+/* for newly allocated folios: */ -+static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) -+{ -+ struct bch_folio *s; -+ -+ s = kzalloc(sizeof(*s) + -+ sizeof(struct bch_folio_sector) * -+ folio_sectors(folio), gfp); -+ if (!s) -+ return NULL; -+ -+ spin_lock_init(&s->lock); -+ folio_attach_private(folio, s); -+ return s; -+} -+ -+static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) -+{ -+ return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); -+} -+ -+static unsigned bkey_to_sector_state(struct bkey_s_c k) -+{ -+ if (bkey_extent_is_reservation(k)) -+ return SECTOR_reserved; -+ if (bkey_extent_is_allocation(k.k)) -+ return SECTOR_allocated; -+ return SECTOR_unallocated; -+} -+ -+static void __bch2_folio_set(struct folio *folio, -+ unsigned pg_offset, unsigned pg_len, -+ unsigned nr_ptrs, unsigned state) -+{ -+ struct bch_folio *s = bch2_folio(folio); -+ unsigned i, sectors = folio_sectors(folio); -+ -+ BUG_ON(pg_offset >= sectors); -+ BUG_ON(pg_offset + pg_len > sectors); -+ -+ spin_lock(&s->lock); -+ -+ for (i = pg_offset; i < pg_offset + pg_len; i++) { -+ s->s[i].nr_replicas = nr_ptrs; -+ folio_sector_set(folio, s, i, state); -+ } -+ -+ if (i == sectors) -+ s->uptodate = true; -+ -+ spin_unlock(&s->lock); -+} -+ -+/* -+ * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the -+ * extents btree: -+ */ -+static int bch2_folio_set(struct bch_fs *c, subvol_inum inum, -+ struct folio **folios, unsigned nr_folios) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_folio *s; -+ u64 offset = folio_sector(folios[0]); -+ unsigned folio_idx; -+ u32 snapshot; -+ bool need_set = false; -+ int ret; -+ -+ 
for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { -+ s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); -+ if (!s) -+ return -ENOMEM; -+ -+ need_set |= !s->uptodate; -+ } -+ -+ if (!need_set) -+ return 0; -+ -+ folio_idx = 0; -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, -+ SPOS(inum.inum, offset, snapshot), -+ BTREE_ITER_SLOTS, k, ret) { -+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); -+ unsigned state = bkey_to_sector_state(k); -+ -+ while (folio_idx < nr_folios) { -+ struct folio *folio = folios[folio_idx]; -+ u64 folio_start = folio_sector(folio); -+ u64 folio_end = folio_end_sector(folio); -+ unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start; -+ unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start; -+ -+ BUG_ON(k.k->p.offset < folio_start); -+ BUG_ON(bkey_start_offset(k.k) > folio_end); -+ -+ if (!bch2_folio(folio)->uptodate) -+ __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); -+ -+ if (k.k->p.offset < folio_end) -+ break; -+ folio_idx++; -+ } -+ -+ if (folio_idx == nr_folios) -+ break; -+ } -+ -+ offset = iter.pos.offset; -+ bch2_trans_iter_exit(&trans, &iter); -+err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) -+{ -+ struct bvec_iter iter; -+ struct folio_vec fv; -+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v -+ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); -+ unsigned state = bkey_to_sector_state(k); -+ -+ bio_for_each_folio(fv, bio, iter) -+ __bch2_folio_set(fv.fv_folio, -+ fv.fv_offset >> 9, -+ fv.fv_len >> 9, -+ nr_ptrs, state); -+} -+ -+static void mark_pagecache_unallocated(struct bch_inode_info *inode, -+ u64 start, u64 end) -+{ -+ pgoff_t index = start >> PAGE_SECTORS_SHIFT; -+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; -+ struct folio_batch fbatch; -+ unsigned i, j; -+ -+ if (end <= start) -+ return; -+ -+ folio_batch_init(&fbatch); -+ -+ while (filemap_get_folios(inode->v.i_mapping, -+ &index, end_index, &fbatch)) { -+ for (i = 0; i < folio_batch_count(&fbatch); i++) { -+ struct folio *folio = fbatch.folios[i]; -+ u64 folio_start = folio_sector(folio); -+ u64 folio_end = folio_end_sector(folio); -+ unsigned folio_offset = max(start, folio_start) - folio_start; -+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; -+ struct bch_folio *s; -+ -+ BUG_ON(end <= folio_start); -+ -+ folio_lock(folio); -+ s = bch2_folio(folio); -+ -+ if (s) { -+ spin_lock(&s->lock); -+ for (j = folio_offset; j < folio_offset + folio_len; j++) -+ s->s[j].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ -+ folio_unlock(folio); -+ } -+ folio_batch_release(&fbatch); -+ cond_resched(); -+ } -+} -+ -+static void mark_pagecache_reserved(struct bch_inode_info *inode, -+ u64 start, u64 end) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ pgoff_t index = start >> PAGE_SECTORS_SHIFT; -+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; -+ struct folio_batch fbatch; -+ s64 i_sectors_delta = 0; -+ unsigned i, j; -+ -+ if (end <= start) -+ return; -+ -+ folio_batch_init(&fbatch); -+ -+ while (filemap_get_folios(inode->v.i_mapping, -+ &index, end_index, &fbatch)) { -+ for (i = 0; i < folio_batch_count(&fbatch); i++) { -+ struct 
folio *folio = fbatch.folios[i]; -+ u64 folio_start = folio_sector(folio); -+ u64 folio_end = folio_end_sector(folio); -+ unsigned folio_offset = max(start, folio_start) - folio_start; -+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; -+ struct bch_folio *s; -+ -+ BUG_ON(end <= folio_start); -+ -+ folio_lock(folio); -+ s = bch2_folio(folio); -+ -+ if (s) { -+ spin_lock(&s->lock); -+ for (j = folio_offset; j < folio_offset + folio_len; j++) { -+ i_sectors_delta -= s->s[j].state == SECTOR_dirty; -+ folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); -+ } -+ spin_unlock(&s->lock); -+ } -+ -+ folio_unlock(folio); -+ } -+ folio_batch_release(&fbatch); -+ cond_resched(); -+ } -+ -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); -+} -+ -+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) -+{ -+ /* XXX: this should not be open coded */ -+ return inode->ei_inode.bi_data_replicas -+ ? inode->ei_inode.bi_data_replicas - 1 -+ : c->opts.data_replicas; -+} -+ -+static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, -+ unsigned nr_replicas) -+{ -+ return max(0, (int) nr_replicas - -+ s->nr_replicas - -+ s->replicas_reserved); -+} -+ -+static int bch2_get_folio_disk_reservation(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct folio *folio, bool check_enospc) -+{ -+ struct bch_folio *s = bch2_folio_create(folio, 0); -+ unsigned nr_replicas = inode_nr_replicas(c, inode); -+ struct disk_reservation disk_res = { 0 }; -+ unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; -+ int ret; -+ -+ if (!s) -+ return -ENOMEM; -+ -+ for (i = 0; i < sectors; i++) -+ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); -+ -+ if (!disk_res_sectors) -+ return 0; -+ -+ ret = bch2_disk_reservation_get(c, &disk_res, -+ disk_res_sectors, 1, -+ !check_enospc -+ ? 
BCH_DISK_RESERVATION_NOFAIL -+ : 0); -+ if (unlikely(ret)) -+ return ret; -+ -+ for (i = 0; i < sectors; i++) -+ s->s[i].replicas_reserved += -+ sectors_to_reserve(&s->s[i], nr_replicas); -+ -+ return 0; -+} -+ -+struct bch2_folio_reservation { -+ struct disk_reservation disk; -+ struct quota_res quota; -+}; -+ -+static void bch2_folio_reservation_init(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch2_folio_reservation *res) -+{ -+ memset(res, 0, sizeof(*res)); -+ -+ res->disk.nr_replicas = inode_nr_replicas(c, inode); -+} -+ -+static void bch2_folio_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch2_folio_reservation *res) -+{ -+ bch2_disk_reservation_put(c, &res->disk); -+ bch2_quota_reservation_put(c, inode, &res->quota); -+} -+ -+static int bch2_folio_reservation_get(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct folio *folio, -+ struct bch2_folio_reservation *res, -+ unsigned offset, unsigned len) -+{ -+ struct bch_folio *s = bch2_folio_create(folio, 0); -+ unsigned i, disk_sectors = 0, quota_sectors = 0; -+ int ret; -+ -+ if (!s) -+ return -ENOMEM; -+ -+ BUG_ON(!s->uptodate); -+ -+ for (i = round_down(offset, block_bytes(c)) >> 9; -+ i < round_up(offset + len, block_bytes(c)) >> 9; -+ i++) { -+ disk_sectors += sectors_to_reserve(&s->s[i], -+ res->disk.nr_replicas); -+ quota_sectors += s->s[i].state == SECTOR_unallocated; -+ } -+ -+ if (disk_sectors) { -+ ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); -+ if (unlikely(ret)) -+ return ret; -+ } -+ -+ if (quota_sectors) { -+ ret = bch2_quota_reservation_add(c, inode, &res->quota, -+ quota_sectors, true); -+ if (unlikely(ret)) { -+ struct disk_reservation tmp = { -+ .sectors = disk_sectors -+ }; -+ -+ bch2_disk_reservation_put(c, &tmp); -+ res->disk.sectors -= disk_sectors; -+ return ret; -+ } -+ } -+ -+ return 0; -+} -+ -+static void bch2_clear_folio_bits(struct folio *folio) -+{ -+ struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_folio *s = bch2_folio(folio); -+ struct disk_reservation disk_res = { 0 }; -+ int i, sectors = folio_sectors(folio), dirty_sectors = 0; -+ -+ if (!s) -+ return; -+ -+ EBUG_ON(!folio_test_locked(folio)); -+ EBUG_ON(folio_test_writeback(folio)); -+ -+ for (i = 0; i < sectors; i++) { -+ disk_res.sectors += s->s[i].replicas_reserved; -+ s->s[i].replicas_reserved = 0; -+ -+ dirty_sectors -= s->s[i].state == SECTOR_dirty; -+ folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); -+ } -+ -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ i_sectors_acct(c, inode, NULL, dirty_sectors); -+ -+ bch2_folio_release(folio); -+} -+ -+static void bch2_set_folio_dirty(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct folio *folio, -+ struct bch2_folio_reservation *res, -+ unsigned offset, unsigned len) -+{ -+ struct bch_folio *s = bch2_folio(folio); -+ unsigned i, dirty_sectors = 0; -+ -+ WARN_ON((u64) folio_pos(folio) + offset + len > -+ round_up((u64) i_size_read(&inode->v), block_bytes(c))); -+ -+ BUG_ON(!s->uptodate); -+ -+ spin_lock(&s->lock); -+ -+ for (i = round_down(offset, block_bytes(c)) >> 9; -+ i < round_up(offset + len, block_bytes(c)) >> 9; -+ i++) { -+ unsigned sectors = sectors_to_reserve(&s->s[i], -+ res->disk.nr_replicas); -+ -+ /* -+ * This can happen if we race with the error path in -+ * bch2_writepage_io_done(): -+ */ -+ sectors = min_t(unsigned, sectors, res->disk.sectors); -+ -+ s->s[i].replicas_reserved += sectors; -+ 
res->disk.sectors -= sectors; -+ -+ dirty_sectors += s->s[i].state == SECTOR_unallocated; -+ -+ folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); -+ } -+ -+ spin_unlock(&s->lock); -+ -+ i_sectors_acct(c, inode, &res->quota, dirty_sectors); -+ -+ if (!folio_test_dirty(folio)) -+ filemap_dirty_folio(inode->v.i_mapping, folio); -+} -+ -+vm_fault_t bch2_page_fault(struct vm_fault *vmf) -+{ -+ struct file *file = vmf->vma->vm_file; -+ struct address_space *mapping = file->f_mapping; -+ struct address_space *fdm = faults_disabled_mapping(); -+ struct bch_inode_info *inode = file_bch_inode(file); -+ vm_fault_t ret; -+ -+ if (fdm == mapping) -+ return VM_FAULT_SIGBUS; -+ -+ /* Lock ordering: */ -+ if (fdm > mapping) { -+ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); -+ -+ if (bch2_pagecache_add_tryget(inode)) -+ goto got_lock; -+ -+ bch2_pagecache_block_put(fdm_host); -+ -+ bch2_pagecache_add_get(inode); -+ bch2_pagecache_add_put(inode); -+ -+ bch2_pagecache_block_get(fdm_host); -+ -+ /* Signal that lock has been dropped: */ -+ set_fdm_dropped_locks(); -+ return VM_FAULT_SIGBUS; -+ } -+ -+ bch2_pagecache_add_get(inode); -+got_lock: -+ ret = filemap_fault(vmf); -+ bch2_pagecache_add_put(inode); -+ -+ return ret; -+} -+ -+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) -+{ -+ struct folio *folio = page_folio(vmf->page); -+ struct file *file = vmf->vma->vm_file; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct address_space *mapping = file->f_mapping; -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_folio_reservation res; -+ unsigned len; -+ loff_t isize; -+ vm_fault_t ret; -+ -+ bch2_folio_reservation_init(c, inode, &res); -+ -+ sb_start_pagefault(inode->v.i_sb); -+ file_update_time(file); -+ -+ /* -+ * Not strictly necessary, but helps avoid dio writes livelocking in -+ * write_invalidate_inode_pages_range() - can drop this if/when we get -+ * a write_invalidate_inode_pages_range() that works without dropping -+ * page lock before invalidating page -+ */ -+ bch2_pagecache_add_get(inode); -+ -+ folio_lock(folio); -+ isize = i_size_read(&inode->v); -+ -+ if (folio->mapping != mapping || folio_pos(folio) >= isize) { -+ folio_unlock(folio); -+ ret = VM_FAULT_NOPAGE; -+ goto out; -+ } -+ -+ len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); -+ -+ if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: -+ bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { -+ folio_unlock(folio); -+ ret = VM_FAULT_SIGBUS; -+ goto out; -+ } -+ -+ bch2_set_folio_dirty(c, inode, folio, &res, 0, len); -+ bch2_folio_reservation_put(c, inode, &res); -+ -+ folio_wait_stable(folio); -+ ret = VM_FAULT_LOCKED; -+out: -+ bch2_pagecache_add_put(inode); -+ sb_end_pagefault(inode->v.i_sb); -+ -+ return ret; -+} -+ -+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) -+{ -+ if (offset || length < folio_size(folio)) -+ return; -+ -+ bch2_clear_folio_bits(folio); -+} -+ -+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) -+{ -+ if (folio_test_dirty(folio) || folio_test_writeback(folio)) -+ return false; -+ -+ bch2_clear_folio_bits(folio); -+ return true; -+} -+ +/* readpage(s): */ + +static void bch2_readpages_end_io(struct bio *bio) @@ -52031,7 +51405,7 @@ index 000000000..6b691b2b5 + + iter->mapping = ractl->mapping; + -+ ret = filemap_get_contig_folios_d(iter->mapping, ++ ret = bch2_filemap_get_contig_folios_d(iter->mapping, + ractl->_index << PAGE_SHIFT, + (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, + 0, 
mapping_gfp_mask(iter->mapping), @@ -52304,8 +51678,7 @@ index 000000000..6b691b2b5 + complete(bio->bi_private); +} + -+static int bch2_read_single_folio(struct folio *folio, -+ struct address_space *mapping) ++int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -52345,6 +51718,13 @@ index 000000000..6b691b2b5 + +/* writepages: */ + ++struct bch_writepage_io { ++ struct bch_inode_info *inode; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ +struct bch_writepage_state { + struct bch_writepage_io *io; + struct bch_io_opts opts; @@ -52416,7 +51796,7 @@ index 000000000..6b691b2b5 + * PageWriteback is effectively our ref on the inode - fixup i_blocks + * before calling end_page_writeback: + */ -+ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); ++ bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); + + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s = __bch2_folio(fi.folio); @@ -52510,8 +51890,7 @@ index 000000000..6b691b2b5 + + if (f_sectors > w->tmp_sectors) { + kfree(w->tmp); -+ w->tmp = kzalloc(sizeof(struct bch_folio_sector) * -+ f_sectors, __GFP_NOFAIL); ++ w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); + w->tmp_sectors = f_sectors; + } + @@ -52543,7 +51922,7 @@ index 000000000..6b691b2b5 + ? 0 : nr_replicas_this_write; + + s->s[i].replicas_reserved = 0; -+ folio_sector_set(folio, s, i, SECTOR_allocated); ++ bch2_folio_sector_set(folio, s, i, SECTOR_allocated); + } + spin_unlock(&s->lock); + @@ -52797,7 +52176,7 @@ index 000000000..6b691b2b5 + bch2_folio_reservation_init(c, inode, &res); + darray_init(&folios); + -+ ret = filemap_get_contig_folios_d(mapping, pos, end, ++ ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, + FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, + mapping_gfp_mask(mapping), + &folios); @@ -52877,6 +52256,7 @@ index 000000000..6b691b2b5 + if (!folio_test_uptodate(f) && + f_copied != folio_size(f) && + pos + copied + f_copied < inode->v.i_size) { ++ iov_iter_revert(iter, f_copied); + folio_zero_range(f, 0, folio_size(f)); + folios_trunc(&folios, fi); + break; @@ -53011,8 +52391,123 @@ index 000000000..6b691b2b5 + return written ? 
written : ret; +} + ++ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct file *file = iocb->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ ssize_t ret; ++ ++ if (iocb->ki_flags & IOCB_DIRECT) { ++ ret = bch2_direct_write(iocb, from); ++ goto out; ++ } ++ ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(iocb, from); ++ if (ret <= 0) ++ goto unlock; ++ ++ ret = file_remove_privs(file); ++ if (ret) ++ goto unlock; ++ ++ ret = file_update_time(file); ++ if (ret) ++ goto unlock; ++ ++ ret = bch2_buffered_write(iocb, from); ++ if (likely(ret > 0)) ++ iocb->ki_pos += ret; ++unlock: ++ inode_unlock(&inode->v); ++ ++ if (ret > 0) ++ ret = generic_write_sync(iocb, ret); ++out: ++ return bch2_err_class(ret); ++} ++ ++void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) ++{ ++ bioset_exit(&c->writepage_bioset); ++} ++ ++int bch2_fs_fs_io_buffered_init(struct bch_fs *c) ++{ ++ if (bioset_init(&c->writepage_bioset, ++ 4, offsetof(struct bch_writepage_io, op.wbio.bio), ++ BIOSET_NEED_BVECS)) ++ return -BCH_ERR_ENOMEM_writepage_bioset_init; ++ ++ return 0; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h +new file mode 100644 +index 000000000..a6126ff79 +--- /dev/null ++++ b/fs/bcachefs/fs-io-buffered.h +@@ -0,0 +1,27 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IO_BUFFERED_H ++#define _BCACHEFS_FS_IO_BUFFERED_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++int bch2_read_single_folio(struct folio *, struct address_space *); ++int bch2_read_folio(struct file *, struct folio *); ++ ++int bch2_writepages(struct address_space *, struct writeback_control *); ++void bch2_readahead(struct readahead_control *); ++ ++int bch2_write_begin(struct file *, struct address_space *, loff_t, ++ unsigned, struct page **, void **); ++int bch2_write_end(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page *, void *); ++ ++ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); ++ ++void bch2_fs_fs_io_buffered_exit(struct bch_fs *); ++int bch2_fs_fs_io_buffered_init(struct bch_fs *); ++#else ++static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {} ++static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; } ++#endif ++ ++#endif /* _BCACHEFS_FS_IO_BUFFERED_H */ +diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c +new file mode 100644 +index 000000000..2b29abd24 +--- /dev/null ++++ b/fs/bcachefs/fs-io-direct.c +@@ -0,0 +1,679 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fs-io-direct.h" ++#include "fs-io-pagecache.h" ++#include "io.h" ++ ++#include ++#include ++#include ++ +/* O_DIRECT reads */ + ++struct dio_read { ++ struct closure cl; ++ struct kiocb *req; ++ long ret; ++ bool should_dirty; ++ struct bch_read_bio rbio; ++}; ++ +static void bio_check_or_release(struct bio *bio, bool check_dirty) +{ + if (check_dirty) { @@ -53198,6 +52693,26 @@ index 000000000..6b691b2b5 + +/* O_DIRECT writes */ + ++struct dio_write { ++ struct kiocb *req; ++ struct address_space *mapping; ++ struct bch_inode_info *inode; ++ struct mm_struct *mm; ++ unsigned loop:1, ++ extending:1, ++ sync:1, ++ flush:1, ++ free_iov:1; ++ struct quota_res quota_res; ++ u64 written; ++ ++ struct iov_iter iter; ++ struct iovec inline_vecs[2]; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ +static bool 
bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 size, + unsigned nr_replicas, bool compressed) @@ -53321,7 +52836,8 @@ index 000000000..6b691b2b5 + if (ret) { + dio->op.error = ret; + } else { -+ bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); ++ bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, ++ &dio->op.cl); + bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); + } + } @@ -53387,7 +52903,7 @@ index 000000000..6b691b2b5 + + if (dio->op.i_sectors_delta || dio->quota_res.sectors) { + mutex_lock(&inode->ei_quota_lock); -+ __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); ++ __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); + __bch2_quota_reservation_put(c, inode, &dio->quota_res); + mutex_unlock(&inode->ei_quota_lock); + } @@ -53436,7 +52952,7 @@ index 000000000..6b691b2b5 + goto err; + + if (unlikely(dropped_locks)) { -+ ret = write_invalidate_inode_pages_range(mapping, ++ ret = bch2_write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter_count - 1); + if (unlikely(ret)) @@ -53542,7 +53058,6 @@ index 000000000..6b691b2b5 + bch2_dio_write_continue(dio); +} + -+static noinline +ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; @@ -53606,7 +53121,7 @@ index 000000000..6b691b2b5 + dio->op.c = c; + + if (unlikely(mapping->nrpages)) { -+ ret = write_invalidate_inode_pages_range(mapping, ++ ret = bch2_write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); + if (unlikely(ret)) @@ -53625,44 +53140,1196 @@ index 000000000..6b691b2b5 + goto err; +} + -+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) ++void bch2_fs_fs_io_direct_exit(struct bch_fs *c) +{ -+ struct file *file = iocb->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ ssize_t ret; ++ bioset_exit(&c->dio_write_bioset); ++ bioset_exit(&c->dio_read_bioset); ++} + -+ if (iocb->ki_flags & IOCB_DIRECT) { -+ ret = bch2_direct_write(iocb, from); ++int bch2_fs_fs_io_direct_init(struct bch_fs *c) ++{ ++ if (bioset_init(&c->dio_read_bioset, ++ 4, offsetof(struct dio_read, rbio.bio), ++ BIOSET_NEED_BVECS)) ++ return -BCH_ERR_ENOMEM_dio_read_bioset_init; ++ ++ if (bioset_init(&c->dio_write_bioset, ++ 4, offsetof(struct dio_write, op.wbio.bio), ++ BIOSET_NEED_BVECS)) ++ return -BCH_ERR_ENOMEM_dio_write_bioset_init; ++ ++ return 0; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h +new file mode 100644 +index 000000000..814621ec7 +--- /dev/null ++++ b/fs/bcachefs/fs-io-direct.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IO_DIRECT_H ++#define _BCACHEFS_FS_IO_DIRECT_H ++ ++#ifndef NO_BCACHEFS_FS ++ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *); ++ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); ++ ++void bch2_fs_fs_io_direct_exit(struct bch_fs *); ++int bch2_fs_fs_io_direct_init(struct bch_fs *); ++#else ++static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {} ++static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; } ++#endif ++ ++#endif /* _BCACHEFS_FS_IO_DIRECT_H */ +diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c +new file mode 100644 +index 000000000..1e60eead2 +--- /dev/null ++++ b/fs/bcachefs/fs-io-pagecache.c +@@ -0,0 +1,788 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef 
NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "btree_iter.h" ++#include "extents.h" ++#include "fs-io.h" ++#include "fs-io-pagecache.h" ++#include "subvolume.h" ++ ++#include ++#include ++ ++int bch2_filemap_get_contig_folios_d(struct address_space *mapping, ++ loff_t start, u64 end, ++ int fgp_flags, gfp_t gfp, ++ folios *folios) ++{ ++ struct folio *f; ++ u64 pos = start; ++ int ret = 0; ++ ++ while (pos < end) { ++ if ((u64) pos >= (u64) start + (1ULL << 20)) ++ fgp_flags &= ~FGP_CREAT; ++ ++ ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); ++ if (ret) ++ break; ++ ++ f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); ++ if (IS_ERR_OR_NULL(f)) ++ break; ++ ++ BUG_ON(folios->nr && folio_pos(f) != pos); ++ ++ pos = folio_end_pos(f); ++ darray_push(folios, f); ++ } ++ ++ if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) ++ ret = -ENOMEM; ++ ++ return folios->nr ? 0 : ret; ++} ++ ++/* pagecache_block must be held */ ++int bch2_write_invalidate_inode_pages_range(struct address_space *mapping, ++ loff_t start, loff_t end) ++{ ++ int ret; ++ ++ /* ++ * XXX: the way this is currently implemented, we can spin if a process ++ * is continually redirtying a specific page ++ */ ++ do { ++ if (!mapping->nrpages) ++ return 0; ++ ++ ret = filemap_write_and_wait_range(mapping, start, end); ++ if (ret) ++ break; ++ ++ if (!mapping->nrpages) ++ return 0; ++ ++ ret = invalidate_inode_pages2_range(mapping, ++ start >> PAGE_SHIFT, ++ end >> PAGE_SHIFT); ++ } while (ret == -EBUSY); ++ ++ return ret; ++} ++ ++static const char * const bch2_folio_sector_states[] = { ++#define x(n) #n, ++ BCH_FOLIO_SECTOR_STATE() ++#undef x ++ NULL ++}; ++ ++static inline enum bch_folio_sector_state ++folio_sector_dirty(enum bch_folio_sector_state state) ++{ ++ switch (state) { ++ case SECTOR_unallocated: ++ return SECTOR_dirty; ++ case SECTOR_reserved: ++ return SECTOR_dirty_reserved; ++ default: ++ return state; ++ } ++} ++ ++static inline enum bch_folio_sector_state ++folio_sector_undirty(enum bch_folio_sector_state state) ++{ ++ switch (state) { ++ case SECTOR_dirty: ++ return SECTOR_unallocated; ++ case SECTOR_dirty_reserved: ++ return SECTOR_reserved; ++ default: ++ return state; ++ } ++} ++ ++static inline enum bch_folio_sector_state ++folio_sector_reserve(enum bch_folio_sector_state state) ++{ ++ switch (state) { ++ case SECTOR_unallocated: ++ return SECTOR_reserved; ++ case SECTOR_dirty: ++ return SECTOR_dirty_reserved; ++ default: ++ return state; ++ } ++} ++ ++/* for newly allocated folios: */ ++struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) ++{ ++ struct bch_folio *s; ++ ++ s = kzalloc(sizeof(*s) + ++ sizeof(struct bch_folio_sector) * ++ folio_sectors(folio), gfp); ++ if (!s) ++ return NULL; ++ ++ spin_lock_init(&s->lock); ++ folio_attach_private(folio, s); ++ return s; ++} ++ ++struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) ++{ ++ return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); ++} ++ ++static unsigned bkey_to_sector_state(struct bkey_s_c k) ++{ ++ if (bkey_extent_is_reservation(k)) ++ return SECTOR_reserved; ++ if (bkey_extent_is_allocation(k.k)) ++ return SECTOR_allocated; ++ return SECTOR_unallocated; ++} ++ ++static void __bch2_folio_set(struct folio *folio, ++ unsigned pg_offset, unsigned pg_len, ++ unsigned nr_ptrs, unsigned state) ++{ ++ struct bch_folio *s = bch2_folio(folio); ++ unsigned i, sectors = folio_sectors(folio); ++ ++ BUG_ON(pg_offset >= sectors); ++ BUG_ON(pg_offset + pg_len > sectors); ++ ++ 
spin_lock(&s->lock); ++ ++ for (i = pg_offset; i < pg_offset + pg_len; i++) { ++ s->s[i].nr_replicas = nr_ptrs; ++ bch2_folio_sector_set(folio, s, i, state); ++ } ++ ++ if (i == sectors) ++ s->uptodate = true; ++ ++ spin_unlock(&s->lock); ++} ++ ++/* ++ * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the ++ * extents btree: ++ */ ++int bch2_folio_set(struct bch_fs *c, subvol_inum inum, ++ struct folio **folios, unsigned nr_folios) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_folio *s; ++ u64 offset = folio_sector(folios[0]); ++ unsigned folio_idx; ++ u32 snapshot; ++ bool need_set = false; ++ int ret; ++ ++ for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { ++ s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); ++ if (!s) ++ return -ENOMEM; ++ ++ need_set |= !s->uptodate; ++ } ++ ++ if (!need_set) ++ return 0; ++ ++ folio_idx = 0; ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, ++ SPOS(inum.inum, offset, snapshot), ++ BTREE_ITER_SLOTS, k, ret) { ++ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); ++ unsigned state = bkey_to_sector_state(k); ++ ++ while (folio_idx < nr_folios) { ++ struct folio *folio = folios[folio_idx]; ++ u64 folio_start = folio_sector(folio); ++ u64 folio_end = folio_end_sector(folio); ++ unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - ++ folio_start; ++ unsigned folio_len = min(k.k->p.offset, folio_end) - ++ folio_offset - folio_start; ++ ++ BUG_ON(k.k->p.offset < folio_start); ++ BUG_ON(bkey_start_offset(k.k) > folio_end); ++ ++ if (!bch2_folio(folio)->uptodate) ++ __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); ++ ++ if (k.k->p.offset < folio_end) ++ break; ++ folio_idx++; ++ } ++ ++ if (folio_idx == nr_folios) ++ break; ++ } ++ ++ offset = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) ++{ ++ struct bvec_iter iter; ++ struct folio_vec fv; ++ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v ++ ? 
0 : bch2_bkey_nr_ptrs_fully_allocated(k); ++ unsigned state = bkey_to_sector_state(k); ++ ++ bio_for_each_folio(fv, bio, iter) ++ __bch2_folio_set(fv.fv_folio, ++ fv.fv_offset >> 9, ++ fv.fv_len >> 9, ++ nr_ptrs, state); ++} ++ ++void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct folio_batch fbatch; ++ unsigned i, j; ++ ++ if (end <= start) ++ return; ++ ++ folio_batch_init(&fbatch); ++ ++ while (filemap_get_folios(inode->v.i_mapping, ++ &index, end_index, &fbatch)) { ++ for (i = 0; i < folio_batch_count(&fbatch); i++) { ++ struct folio *folio = fbatch.folios[i]; ++ u64 folio_start = folio_sector(folio); ++ u64 folio_end = folio_end_sector(folio); ++ unsigned folio_offset = max(start, folio_start) - folio_start; ++ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; ++ struct bch_folio *s; ++ ++ BUG_ON(end <= folio_start); ++ ++ folio_lock(folio); ++ s = bch2_folio(folio); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = folio_offset; j < folio_offset + folio_len; j++) ++ s->s[j].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ ++ folio_unlock(folio); ++ } ++ folio_batch_release(&fbatch); ++ cond_resched(); ++ } ++} ++ ++void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct folio_batch fbatch; ++ s64 i_sectors_delta = 0; ++ unsigned i, j; ++ ++ if (end <= start) ++ return; ++ ++ folio_batch_init(&fbatch); ++ ++ while (filemap_get_folios(inode->v.i_mapping, ++ &index, end_index, &fbatch)) { ++ for (i = 0; i < folio_batch_count(&fbatch); i++) { ++ struct folio *folio = fbatch.folios[i]; ++ u64 folio_start = folio_sector(folio); ++ u64 folio_end = folio_end_sector(folio); ++ unsigned folio_offset = max(start, folio_start) - folio_start; ++ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; ++ struct bch_folio *s; ++ ++ BUG_ON(end <= folio_start); ++ ++ folio_lock(folio); ++ s = bch2_folio(folio); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = folio_offset; j < folio_offset + folio_len; j++) { ++ i_sectors_delta -= s->s[j].state == SECTOR_dirty; ++ bch2_folio_sector_set(folio, s, j, ++ folio_sector_reserve(s->s[j].state)); ++ } ++ spin_unlock(&s->lock); ++ } ++ ++ folio_unlock(folio); ++ } ++ folio_batch_release(&fbatch); ++ cond_resched(); ++ } ++ ++ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); ++} ++ ++static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, ++ unsigned nr_replicas) ++{ ++ return max(0, (int) nr_replicas - ++ s->nr_replicas - ++ s->replicas_reserved); ++} ++ ++int bch2_get_folio_disk_reservation(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct folio *folio, bool check_enospc) ++{ ++ struct bch_folio *s = bch2_folio_create(folio, 0); ++ unsigned nr_replicas = inode_nr_replicas(c, inode); ++ struct disk_reservation disk_res = { 0 }; ++ unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ for (i = 0; i < sectors; i++) ++ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ if (!disk_res_sectors) ++ return 0; ++ ++ ret = bch2_disk_reservation_get(c, &disk_res, ++ disk_res_sectors, 1, ++ !check_enospc ++ ? 
BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ ++ for (i = 0; i < sectors; i++) ++ s->s[i].replicas_reserved += ++ sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ return 0; ++} ++ ++void bch2_folio_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_folio_reservation *res) ++{ ++ bch2_disk_reservation_put(c, &res->disk); ++ bch2_quota_reservation_put(c, inode, &res->quota); ++} ++ ++int bch2_folio_reservation_get(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct folio *folio, ++ struct bch2_folio_reservation *res, ++ unsigned offset, unsigned len) ++{ ++ struct bch_folio *s = bch2_folio_create(folio, 0); ++ unsigned i, disk_sectors = 0, quota_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ BUG_ON(!s->uptodate); ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ disk_sectors += sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ quota_sectors += s->s[i].state == SECTOR_unallocated; ++ } ++ ++ if (disk_sectors) { ++ ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ if (quota_sectors) { ++ ret = bch2_quota_reservation_add(c, inode, &res->quota, ++ quota_sectors, true); ++ if (unlikely(ret)) { ++ struct disk_reservation tmp = { ++ .sectors = disk_sectors ++ }; ++ ++ bch2_disk_reservation_put(c, &tmp); ++ res->disk.sectors -= disk_sectors; ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_clear_folio_bits(struct folio *folio) ++{ ++ struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_folio *s = bch2_folio(folio); ++ struct disk_reservation disk_res = { 0 }; ++ int i, sectors = folio_sectors(folio), dirty_sectors = 0; ++ ++ if (!s) ++ return; ++ ++ EBUG_ON(!folio_test_locked(folio)); ++ EBUG_ON(folio_test_writeback(folio)); ++ ++ for (i = 0; i < sectors; i++) { ++ disk_res.sectors += s->s[i].replicas_reserved; ++ s->s[i].replicas_reserved = 0; ++ ++ dirty_sectors -= s->s[i].state == SECTOR_dirty; ++ bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); ++ } ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ bch2_i_sectors_acct(c, inode, NULL, dirty_sectors); ++ ++ bch2_folio_release(folio); ++} ++ ++void bch2_set_folio_dirty(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct folio *folio, ++ struct bch2_folio_reservation *res, ++ unsigned offset, unsigned len) ++{ ++ struct bch_folio *s = bch2_folio(folio); ++ unsigned i, dirty_sectors = 0; ++ ++ WARN_ON((u64) folio_pos(folio) + offset + len > ++ round_up((u64) i_size_read(&inode->v), block_bytes(c))); ++ ++ BUG_ON(!s->uptodate); ++ ++ spin_lock(&s->lock); ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ unsigned sectors = sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ ++ /* ++ * This can happen if we race with the error path in ++ * bch2_writepage_io_done(): ++ */ ++ sectors = min_t(unsigned, sectors, res->disk.sectors); ++ ++ s->s[i].replicas_reserved += sectors; ++ res->disk.sectors -= sectors; ++ ++ dirty_sectors += s->s[i].state == SECTOR_unallocated; ++ ++ bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); ++ } ++ ++ spin_unlock(&s->lock); ++ ++ bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors); ++ ++ if (!folio_test_dirty(folio)) ++ filemap_dirty_folio(inode->v.i_mapping, folio); ++} 
++ ++vm_fault_t bch2_page_fault(struct vm_fault *vmf) ++{ ++ struct file *file = vmf->vma->vm_file; ++ struct address_space *mapping = file->f_mapping; ++ struct address_space *fdm = faults_disabled_mapping(); ++ struct bch_inode_info *inode = file_bch_inode(file); ++ vm_fault_t ret; ++ ++ if (fdm == mapping) ++ return VM_FAULT_SIGBUS; ++ ++ /* Lock ordering: */ ++ if (fdm > mapping) { ++ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); ++ ++ if (bch2_pagecache_add_tryget(inode)) ++ goto got_lock; ++ ++ bch2_pagecache_block_put(fdm_host); ++ ++ bch2_pagecache_add_get(inode); ++ bch2_pagecache_add_put(inode); ++ ++ bch2_pagecache_block_get(fdm_host); ++ ++ /* Signal that lock has been dropped: */ ++ set_fdm_dropped_locks(); ++ return VM_FAULT_SIGBUS; ++ } ++ ++ bch2_pagecache_add_get(inode); ++got_lock: ++ ret = filemap_fault(vmf); ++ bch2_pagecache_add_put(inode); ++ ++ return ret; ++} ++ ++vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) ++{ ++ struct folio *folio = page_folio(vmf->page); ++ struct file *file = vmf->vma->vm_file; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_folio_reservation res; ++ unsigned len; ++ loff_t isize; ++ vm_fault_t ret; ++ ++ bch2_folio_reservation_init(c, inode, &res); ++ ++ sb_start_pagefault(inode->v.i_sb); ++ file_update_time(file); ++ ++ /* ++ * Not strictly necessary, but helps avoid dio writes livelocking in ++ * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get ++ * a bch2_write_invalidate_inode_pages_range() that works without dropping ++ * page lock before invalidating page ++ */ ++ bch2_pagecache_add_get(inode); ++ ++ folio_lock(folio); ++ isize = i_size_read(&inode->v); ++ ++ if (folio->mapping != mapping || folio_pos(folio) >= isize) { ++ folio_unlock(folio); ++ ret = VM_FAULT_NOPAGE; + goto out; + } + -+ /* We can write back this queue in page reclaim */ -+ current->backing_dev_info = inode_to_bdi(&inode->v); -+ inode_lock(&inode->v); ++ len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); + -+ ret = generic_write_checks(iocb, from); -+ if (ret <= 0) -+ goto unlock; ++ if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: ++ bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { ++ folio_unlock(folio); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } + -+ ret = file_remove_privs(file); -+ if (ret) -+ goto unlock; ++ bch2_set_folio_dirty(c, inode, folio, &res, 0, len); ++ bch2_folio_reservation_put(c, inode, &res); + -+ ret = file_update_time(file); -+ if (ret) -+ goto unlock; -+ -+ ret = bch2_buffered_write(iocb, from); -+ if (likely(ret > 0)) -+ iocb->ki_pos += ret; -+unlock: -+ inode_unlock(&inode->v); -+ current->backing_dev_info = NULL; -+ -+ if (ret > 0) -+ ret = generic_write_sync(iocb, ret); ++ folio_wait_stable(folio); ++ ret = VM_FAULT_LOCKED; +out: -+ return bch2_err_class(ret); ++ bch2_pagecache_add_put(inode); ++ sb_end_pagefault(inode->v.i_sb); ++ ++ return ret; ++} ++ ++void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) ++{ ++ if (offset || length < folio_size(folio)) ++ return; ++ ++ bch2_clear_folio_bits(folio); ++} ++ ++bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) ++{ ++ if (folio_test_dirty(folio) || folio_test_writeback(folio)) ++ return false; ++ ++ bch2_clear_folio_bits(folio); ++ return true; ++} ++ ++/* fseek: */ ++ ++static int folio_data_offset(struct folio *folio, loff_t pos, ++ unsigned min_replicas) ++{ ++ 
struct bch_folio *s = bch2_folio(folio); ++ unsigned i, sectors = folio_sectors(folio); ++ ++ if (s) ++ for (i = folio_pos_to_s(folio, pos); i < sectors; i++) ++ if (s->s[i].state >= SECTOR_dirty && ++ s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) ++ return i << SECTOR_SHIFT; ++ ++ return -1; ++} ++ ++loff_t bch2_seek_pagecache_data(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset, ++ unsigned min_replicas, ++ bool nonblock) ++{ ++ struct folio_batch fbatch; ++ pgoff_t start_index = start_offset >> PAGE_SHIFT; ++ pgoff_t end_index = end_offset >> PAGE_SHIFT; ++ pgoff_t index = start_index; ++ unsigned i; ++ loff_t ret; ++ int offset; ++ ++ folio_batch_init(&fbatch); ++ ++ while (filemap_get_folios(vinode->i_mapping, ++ &index, end_index, &fbatch)) { ++ for (i = 0; i < folio_batch_count(&fbatch); i++) { ++ struct folio *folio = fbatch.folios[i]; ++ ++ if (!nonblock) { ++ folio_lock(folio); ++ } else if (!folio_trylock(folio)) { ++ folio_batch_release(&fbatch); ++ return -EAGAIN; ++ } ++ ++ offset = folio_data_offset(folio, ++ max(folio_pos(folio), start_offset), ++ min_replicas); ++ if (offset >= 0) { ++ ret = clamp(folio_pos(folio) + offset, ++ start_offset, end_offset); ++ folio_unlock(folio); ++ folio_batch_release(&fbatch); ++ return ret; ++ } ++ folio_unlock(folio); ++ } ++ folio_batch_release(&fbatch); ++ cond_resched(); ++ } ++ ++ return end_offset; ++} ++ ++/* ++ * Search for a hole in a folio. ++ * ++ * The filemap layer returns -ENOENT if no folio exists, so reuse the same error ++ * code to indicate a pagecache hole exists at the returned offset. Otherwise ++ * return 0 if the folio is filled with data, or an error code. This function ++ * can return -EAGAIN if nonblock is specified. ++ */ ++static int folio_hole_offset(struct address_space *mapping, loff_t *offset, ++ unsigned min_replicas, bool nonblock) ++{ ++ struct folio *folio; ++ struct bch_folio *s; ++ unsigned i, sectors; ++ int ret = -ENOENT; ++ ++ folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, ++ FGP_LOCK|(nonblock ? 
FGP_NOWAIT : 0), 0); ++ if (IS_ERR(folio)) ++ return PTR_ERR(folio); ++ ++ s = bch2_folio(folio); ++ if (!s) ++ goto unlock; ++ ++ sectors = folio_sectors(folio); ++ for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) ++ if (s->s[i].state < SECTOR_dirty || ++ s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { ++ *offset = max(*offset, ++ folio_pos(folio) + (i << SECTOR_SHIFT)); ++ goto unlock; ++ } ++ ++ *offset = folio_end_pos(folio); ++ ret = 0; ++unlock: ++ folio_unlock(folio); ++ folio_put(folio); ++ return ret; ++} ++ ++loff_t bch2_seek_pagecache_hole(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset, ++ unsigned min_replicas, ++ bool nonblock) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ loff_t offset = start_offset; ++ loff_t ret = 0; ++ ++ while (!ret && offset < end_offset) ++ ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock); ++ ++ if (ret && ret != -ENOENT) ++ return ret; ++ return min(offset, end_offset); ++} ++ ++int bch2_clamp_data_hole(struct inode *inode, ++ u64 *hole_start, ++ u64 *hole_end, ++ unsigned min_replicas, ++ bool nonblock) ++{ ++ loff_t ret; ++ ++ ret = bch2_seek_pagecache_hole(inode, ++ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; ++ if (ret < 0) ++ return ret; ++ ++ *hole_start = ret; ++ ++ if (*hole_start == *hole_end) ++ return 0; ++ ++ ret = bch2_seek_pagecache_data(inode, ++ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; ++ if (ret < 0) ++ return ret; ++ ++ *hole_end = ret; ++ return 0; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h +new file mode 100644 +index 000000000..a2222ad58 +--- /dev/null ++++ b/fs/bcachefs/fs-io-pagecache.h +@@ -0,0 +1,176 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IO_PAGECACHE_H ++#define _BCACHEFS_FS_IO_PAGECACHE_H ++ ++#include ++ ++typedef DARRAY(struct folio *) folios; ++ ++int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, ++ u64, int, gfp_t, folios *); ++int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); ++ ++/* ++ * Use u64 for the end pos and sector helpers because if the folio covers the ++ * max supported range of the mapping, the start offset of the next folio ++ * overflows loff_t. This breaks much of the range based processing in the ++ * buffered write path. ++ */ ++static inline u64 folio_end_pos(struct folio *folio) ++{ ++ return folio_pos(folio) + folio_size(folio); ++} ++ ++static inline size_t folio_sectors(struct folio *folio) ++{ ++ return PAGE_SECTORS << folio_order(folio); ++} ++ ++static inline loff_t folio_sector(struct folio *folio) ++{ ++ return folio_pos(folio) >> 9; ++} ++ ++static inline u64 folio_end_sector(struct folio *folio) ++{ ++ return folio_end_pos(folio) >> 9; ++} ++ ++#define BCH_FOLIO_SECTOR_STATE() \ ++ x(unallocated) \ ++ x(reserved) \ ++ x(dirty) \ ++ x(dirty_reserved) \ ++ x(allocated) ++ ++enum bch_folio_sector_state { ++#define x(n) SECTOR_##n, ++ BCH_FOLIO_SECTOR_STATE() ++#undef x ++}; ++ ++struct bch_folio_sector { ++ /* Uncompressed, fully allocated replicas (or on disk reservation): */ ++ unsigned nr_replicas:4; ++ ++ /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ ++ unsigned replicas_reserved:4; ++ ++ /* i_sectors: */ ++ enum bch_folio_sector_state state:8; ++}; ++ ++struct bch_folio { ++ spinlock_t lock; ++ atomic_t write_count; ++ /* ++ * Is the sector state up to date with the btree? 
++ * (Not the data itself) ++ */ ++ bool uptodate; ++ struct bch_folio_sector s[]; ++}; ++ ++/* Helper for when we need to add debug instrumentation: */ ++static inline void bch2_folio_sector_set(struct folio *folio, ++ struct bch_folio *s, ++ unsigned i, unsigned n) ++{ ++ s->s[i].state = n; ++} ++ ++/* file offset (to folio offset) to bch_folio_sector index */ ++static inline int folio_pos_to_s(struct folio *folio, loff_t pos) ++{ ++ u64 f_offset = pos - folio_pos(folio); ++ ++ BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); ++ return f_offset >> SECTOR_SHIFT; ++} ++ ++/* for newly allocated folios: */ ++static inline void __bch2_folio_release(struct folio *folio) ++{ ++ kfree(folio_detach_private(folio)); ++} ++ ++static inline void bch2_folio_release(struct folio *folio) ++{ ++ EBUG_ON(!folio_test_locked(folio)); ++ __bch2_folio_release(folio); ++} ++ ++static inline struct bch_folio *__bch2_folio(struct folio *folio) ++{ ++ return folio_has_private(folio) ++ ? (struct bch_folio *) folio_get_private(folio) ++ : NULL; ++} ++ ++static inline struct bch_folio *bch2_folio(struct folio *folio) ++{ ++ EBUG_ON(!folio_test_locked(folio)); ++ ++ return __bch2_folio(folio); ++} ++ ++struct bch_folio *__bch2_folio_create(struct folio *, gfp_t); ++struct bch_folio *bch2_folio_create(struct folio *, gfp_t); ++ ++struct bch2_folio_reservation { ++ struct disk_reservation disk; ++ struct quota_res quota; ++}; ++ ++static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) ++{ ++ /* XXX: this should not be open coded */ ++ return inode->ei_inode.bi_data_replicas ++ ? inode->ei_inode.bi_data_replicas - 1 ++ : c->opts.data_replicas; ++} ++ ++static inline void bch2_folio_reservation_init(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_folio_reservation *res) ++{ ++ memset(res, 0, sizeof(*res)); ++ ++ res->disk.nr_replicas = inode_nr_replicas(c, inode); ++} ++ ++int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); ++void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); ++ ++void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); ++void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64); ++ ++int bch2_get_folio_disk_reservation(struct bch_fs *, ++ struct bch_inode_info *, ++ struct folio *, bool); ++ ++void bch2_folio_reservation_put(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch2_folio_reservation *); ++int bch2_folio_reservation_get(struct bch_fs *, ++ struct bch_inode_info *, ++ struct folio *, ++ struct bch2_folio_reservation *, ++ unsigned, unsigned); ++ ++void bch2_set_folio_dirty(struct bch_fs *, ++ struct bch_inode_info *, ++ struct folio *, ++ struct bch2_folio_reservation *, ++ unsigned, unsigned); ++ ++vm_fault_t bch2_page_fault(struct vm_fault *); ++vm_fault_t bch2_page_mkwrite(struct vm_fault *); ++void bch2_invalidate_folio(struct folio *, size_t, size_t); ++bool bch2_release_folio(struct folio *, gfp_t); ++ ++loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool); ++loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool); ++int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); ++ ++#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +new file mode 100644 +index 000000000..ceab12fb8 +--- /dev/null ++++ b/fs/bcachefs/fs-io.c +@@ -0,0 +1,1250 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include 
"alloc_foreground.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "error.h" ++#include "extents.h" ++#include "extent_update.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fs-io-buffered.h" ++#include "fs-io-pagecache.h" ++#include "fsck.h" ++#include "inode.h" ++#include "journal.h" ++#include "io.h" ++#include "keylist.h" ++#include "quota.h" ++#include "reflink.h" ++#include "trace.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++struct nocow_flush { ++ struct closure *cl; ++ struct bch_dev *ca; ++ struct bio bio; ++}; ++ ++static void nocow_flush_endio(struct bio *_bio) ++{ ++ ++ struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); ++ ++ closure_put(bio->cl); ++ percpu_ref_put(&bio->ca->io_ref); ++ bio_put(&bio->bio); ++} ++ ++void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct closure *cl) ++{ ++ struct nocow_flush *bio; ++ struct bch_dev *ca; ++ struct bch_devs_mask devs; ++ unsigned dev; ++ ++ dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); ++ if (dev == BCH_SB_MEMBERS_MAX) ++ return; ++ ++ devs = inode->ei_devs_need_flush; ++ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); ++ ++ for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { ++ rcu_read_lock(); ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca && !percpu_ref_tryget(&ca->io_ref)) ++ ca = NULL; ++ rcu_read_unlock(); ++ ++ if (!ca) ++ continue; ++ ++ bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, ++ REQ_OP_FLUSH, ++ GFP_KERNEL, ++ &c->nocow_flush_bioset), ++ struct nocow_flush, bio); ++ bio->cl = cl; ++ bio->ca = ca; ++ bio->bio.bi_end_io = nocow_flush_endio; ++ closure_bio_submit(&bio->bio, cl); ++ } ++} ++ ++static int bch2_inode_flush_nocow_writes(struct bch_fs *c, ++ struct bch_inode_info *inode) ++{ ++ struct closure cl; ++ ++ closure_init_stack(&cl); ++ bch2_inode_flush_nocow_writes_async(c, inode, &cl); ++ closure_sync(&cl); ++ ++ return 0; ++} ++ ++/* i_size updates: */ ++ ++struct inode_new_size { ++ loff_t new_size; ++ u64 now; ++ unsigned fields; ++}; ++ ++static int inode_set_size(struct btree_trans *trans, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_new_size *s = p; ++ ++ bi->bi_size = s->new_size; ++ if (s->fields & ATTR_ATIME) ++ bi->bi_atime = s->now; ++ if (s->fields & ATTR_MTIME) ++ bi->bi_mtime = s->now; ++ if (s->fields & ATTR_CTIME) ++ bi->bi_ctime = s->now; ++ ++ return 0; ++} ++ ++int __must_check bch2_write_inode_size(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ loff_t new_size, unsigned fields) ++{ ++ struct inode_new_size s = { ++ .new_size = new_size, ++ .now = bch2_current_time(c), ++ .fields = fields, ++ }; ++ ++ return bch2_write_inode(c, inode, inode_set_size, &s, fields); ++} ++ ++void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, ++ struct quota_res *quota_res, s64 sectors) ++{ ++ bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, ++ "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", ++ inode->v.i_ino, (u64) inode->v.i_blocks, sectors, ++ inode->ei_inode.bi_sectors); ++ inode->v.i_blocks += sectors; ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ if (quota_res && ++ !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && ++ sectors > 0) { ++ BUG_ON(sectors > quota_res->sectors); ++ BUG_ON(sectors > inode->ei_quota_reserved); ++ ++ quota_res->sectors 
-= sectors; ++ inode->ei_quota_reserved -= sectors; ++ } else { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); ++ } ++#endif +} + +/* fsync: */ @@ -53763,7 +54430,7 @@ index 000000000..6b691b2b5 + + folio = __filemap_get_folio(mapping, index, + FGP_LOCK|FGP_CREAT, GFP_KERNEL); -+ if (unlikely(IS_ERR_OR_NULL(folio))) { ++ if (IS_ERR_OR_NULL(folio)) { + ret = -ENOMEM; + goto out; + } @@ -53804,10 +54471,10 @@ index 000000000..6b691b2b5 + s->s[i].nr_replicas = 0; + + i_sectors_delta -= s->s[i].state == SECTOR_dirty; -+ folio_sector_set(folio, s, i, SECTOR_unallocated); ++ bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); + } + -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + + /* + * Caller needs to know whether this folio will be written out by @@ -53891,7 +54558,8 @@ index 000000000..6b691b2b5 + return bch2_setattr_nonsize(idmap, inode, iattr); +} + -+static int bch2_truncate_finish_fn(struct bch_inode_info *inode, ++static int bch2_truncate_finish_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ @@ -53899,7 +54567,8 @@ index 000000000..6b691b2b5 + return 0; +} + -+static int bch2_truncate_start_fn(struct bch_inode_info *inode, ++static int bch2_truncate_start_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, void *p) +{ + u64 *new_i_size = p; @@ -53998,7 +54667,7 @@ index 000000000..6b691b2b5 + ret = bch2_fpunch(c, inode_inum(inode), + round_up(iattr->ia_size, block_bytes(c)) >> 9, + U64_MAX, &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + + bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && + !bch2_journal_error(&c->journal), c, @@ -54020,7 +54689,8 @@ index 000000000..6b691b2b5 + +/* fallocate: */ + -+static int inode_update_times_fn(struct bch_inode_info *inode, ++static int inode_update_times_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -54052,7 +54722,7 @@ index 000000000..6b691b2b5 + ret = bch2_fpunch(c, inode_inum(inode), + block_start >> 9, block_end >> 9, + &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + } + + mutex_lock(&inode->ei_update_lock); @@ -54103,7 +54773,7 @@ index 000000000..6b691b2b5 + + new_size = inode->v.i_size + shift; + -+ ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); ++ ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); + if (ret) + return ret; + @@ -54119,7 +54789,7 @@ index 000000000..6b691b2b5 + ret = bch2_fpunch(c, inode_inum(inode), + offset >> 9, (offset + len) >> 9, + &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + + if (ret) + return ret; @@ -54303,11 +54973,19 @@ index 000000000..6b691b2b5 + } + + if (!(mode & FALLOC_FL_ZERO_RANGE)) { -+ ret = drop_locks_do(&trans, -+ (bch2_clamp_data_hole(&inode->v, -+ &hole_start, -+ &hole_end, -+ opts.data_replicas), 0)); ++ /* ++ * Lock ordering - can't be holding btree locks while ++ * blocking on a folio lock: ++ */ ++ if (bch2_clamp_data_hole(&inode->v, ++ &hole_start, ++ &hole_end, ++ opts.data_replicas, true)) ++ ret = drop_locks_do(&trans, ++ (bch2_clamp_data_hole(&inode->v, ++ &hole_start, ++ 
&hole_end, ++ opts.data_replicas, false), 0)); + bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); + + if (ret) @@ -54332,10 +55010,10 @@ index 000000000..6b691b2b5 + if (ret) + goto bkey_err; + -+ i_sectors_acct(c, inode, "a_res, i_sectors_delta); ++ bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); + + drop_locks_do(&trans, -+ (mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); ++ (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); +bkey_err: + bch2_quota_reservation_put(c, inode, "a_res); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -54348,7 +55026,7 @@ index 000000000..6b691b2b5 + + bch2_fpunch_at(&trans, &iter, inode_inum(inode), + end_sector, &i_sectors_delta); -+ i_sectors_acct(c, inode, "a_res, i_sectors_delta); ++ bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bch2_quota_reservation_put(c, inode, "a_res); + } + @@ -54542,7 +55220,7 @@ index 000000000..6b691b2b5 + + aligned_len = round_up((u64) len, block_bytes(c)); + -+ ret = write_invalidate_inode_pages_range(dst->v.i_mapping, ++ ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, + pos_dst, pos_dst + len - 1); + if (ret) + goto err; @@ -54554,7 +55232,7 @@ index 000000000..6b691b2b5 + + file_update_time(file_dst); + -+ mark_pagecache_unallocated(src, pos_src >> 9, ++ bch2_mark_pagecache_unallocated(src, pos_src >> 9, + (pos_src + aligned_len) >> 9); + + ret = bch2_remap_range(c, @@ -54570,7 +55248,7 @@ index 000000000..6b691b2b5 + */ + ret = min((u64) ret << 9, (u64) len); + -+ i_sectors_acct(c, dst, "a_res, i_sectors_delta); ++ bch2_i_sectors_acct(c, dst, "a_res, i_sectors_delta); + + spin_lock(&dst->v.i_lock); + if (pos_dst + ret > dst->v.i_size) @@ -54589,61 +55267,6 @@ index 000000000..6b691b2b5 + +/* fseek: */ + -+static int folio_data_offset(struct folio *folio, loff_t pos, -+ unsigned min_replicas) -+{ -+ struct bch_folio *s = bch2_folio(folio); -+ unsigned i, sectors = folio_sectors(folio); -+ -+ if (s) -+ for (i = folio_pos_to_s(folio, pos); i < sectors; i++) -+ if (s->s[i].state >= SECTOR_dirty && -+ s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) -+ return i << SECTOR_SHIFT; -+ -+ return -1; -+} -+ -+static loff_t bch2_seek_pagecache_data(struct inode *vinode, -+ loff_t start_offset, -+ loff_t end_offset, -+ unsigned min_replicas) -+{ -+ struct folio_batch fbatch; -+ pgoff_t start_index = start_offset >> PAGE_SHIFT; -+ pgoff_t end_index = end_offset >> PAGE_SHIFT; -+ pgoff_t index = start_index; -+ unsigned i; -+ loff_t ret; -+ int offset; -+ -+ folio_batch_init(&fbatch); -+ -+ while (filemap_get_folios(vinode->i_mapping, -+ &index, end_index, &fbatch)) { -+ for (i = 0; i < folio_batch_count(&fbatch); i++) { -+ struct folio *folio = fbatch.folios[i]; -+ -+ folio_lock(folio); -+ offset = folio_data_offset(folio, -+ max(folio_pos(folio), start_offset), -+ min_replicas); -+ if (offset >= 0) { -+ ret = clamp(folio_pos(folio) + offset, -+ start_offset, end_offset); -+ folio_unlock(folio); -+ folio_batch_release(&fbatch); -+ return ret; -+ } -+ folio_unlock(folio); -+ } -+ folio_batch_release(&fbatch); -+ cond_resched(); -+ } -+ -+ return end_offset; -+} -+ +static loff_t bch2_seek_data(struct file *file, u64 offset) +{ + struct bch_inode_info *inode = file_bch_inode(file); @@ -54689,7 +55312,7 @@ index 000000000..6b691b2b5 + + if (next_data > offset) + next_data = bch2_seek_pagecache_data(&inode->v, -+ offset, next_data, 0); ++ offset, next_data, 0, false); + + if (next_data >= isize) + return -ENXIO; @@ 
-54697,68 +55320,6 @@ index 000000000..6b691b2b5 + return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); +} + -+static bool folio_hole_offset(struct address_space *mapping, loff_t *offset, -+ unsigned min_replicas) -+{ -+ struct folio *folio; -+ struct bch_folio *s; -+ unsigned i, sectors; -+ bool ret = true; -+ -+ folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT); -+ if (IS_ERR_OR_NULL(folio)) -+ return true; -+ -+ s = bch2_folio(folio); -+ if (!s) -+ goto unlock; -+ -+ sectors = folio_sectors(folio); -+ for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) -+ if (s->s[i].state < SECTOR_dirty || -+ s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { -+ *offset = max(*offset, -+ folio_pos(folio) + (i << SECTOR_SHIFT)); -+ goto unlock; -+ } -+ -+ *offset = folio_end_pos(folio); -+ ret = false; -+unlock: -+ folio_unlock(folio); -+ return ret; -+} -+ -+static loff_t bch2_seek_pagecache_hole(struct inode *vinode, -+ loff_t start_offset, -+ loff_t end_offset, -+ unsigned min_replicas) -+{ -+ struct address_space *mapping = vinode->i_mapping; -+ loff_t offset = start_offset; -+ -+ while (offset < end_offset && -+ !folio_hole_offset(mapping, &offset, min_replicas)) -+ ; -+ -+ return min(offset, end_offset); -+} -+ -+static void bch2_clamp_data_hole(struct inode *inode, -+ u64 *hole_start, -+ u64 *hole_end, -+ unsigned min_replicas) -+{ -+ *hole_start = bch2_seek_pagecache_hole(inode, -+ *hole_start << 9, *hole_end << 9, min_replicas) >> 9; -+ -+ if (*hole_start == *hole_end) -+ return; -+ -+ *hole_end = bch2_seek_pagecache_data(inode, -+ *hole_start << 9, *hole_end << 9, min_replicas) >> 9; -+} -+ +static loff_t bch2_seek_hole(struct file *file, u64 offset) +{ + struct bch_inode_info *inode = file_bch_inode(file); @@ -54788,12 +55349,12 @@ index 000000000..6b691b2b5 + BTREE_ITER_SLOTS, k, ret) { + if (k.k->p.inode != inode->v.i_ino) { + next_hole = bch2_seek_pagecache_hole(&inode->v, -+ offset, MAX_LFS_FILESIZE, 0); ++ offset, MAX_LFS_FILESIZE, 0, false); + break; + } else if (!bkey_extent_is_data(k.k)) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + max(offset, bkey_start_offset(k.k) << 9), -+ k.k->p.offset << 9, 0); ++ k.k->p.offset << 9, 0, false); + + if (next_hole < k.k->p.offset << 9) + break; @@ -54843,28 +55404,10 @@ index 000000000..6b691b2b5 +void bch2_fs_fsio_exit(struct bch_fs *c) +{ + bioset_exit(&c->nocow_flush_bioset); -+ bioset_exit(&c->dio_write_bioset); -+ bioset_exit(&c->dio_read_bioset); -+ bioset_exit(&c->writepage_bioset); +} + +int bch2_fs_fsio_init(struct bch_fs *c) +{ -+ if (bioset_init(&c->writepage_bioset, -+ 4, offsetof(struct bch_writepage_io, op.wbio.bio), -+ BIOSET_NEED_BVECS)) -+ return -BCH_ERR_ENOMEM_writepage_bioset_init; -+ -+ if (bioset_init(&c->dio_read_bioset, -+ 4, offsetof(struct dio_read, rbio.bio), -+ BIOSET_NEED_BVECS)) -+ return -BCH_ERR_ENOMEM_dio_read_bioset_init; -+ -+ if (bioset_init(&c->dio_write_bioset, -+ 4, offsetof(struct dio_write, op.wbio.bio), -+ BIOSET_NEED_BVECS)) -+ return -BCH_ERR_ENOMEM_dio_write_bioset_init; -+ + if (bioset_init(&c->nocow_flush_bioset, + 1, offsetof(struct nocow_flush, bio), 0)) + return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; @@ -54875,10 +55418,10 @@ index 000000000..6b691b2b5 +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h new file mode 100644 -index 000000000..af9053315 +index 000000000..bb5b709fa --- /dev/null +++ b/fs/bcachefs/fs-io.h -@@ -0,0 +1,54 @@ +@@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_H 
+#define _BCACHEFS_FS_IO_H @@ -54886,29 +55429,164 @@ index 000000000..af9053315 +#ifndef NO_BCACHEFS_FS + +#include "buckets.h" ++#include "fs.h" +#include "io_types.h" ++#include "quota.h" + +#include + -+struct quota_res; ++struct folio_vec { ++ struct folio *fv_folio; ++ size_t fv_offset; ++ size_t fv_len; ++}; ++ ++static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) ++{ ++ ++ struct folio *folio = page_folio(bv.bv_page); ++ size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + ++ bv.bv_offset; ++ size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); ++ ++ return (struct folio_vec) { ++ .fv_folio = folio, ++ .fv_offset = offset, ++ .fv_len = len, ++ }; ++} ++ ++static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, ++ struct bvec_iter iter) ++{ ++ return biovec_to_foliovec(bio_iter_iovec(bio, iter)); ++} ++ ++#define __bio_for_each_folio(bvl, bio, iter, start) \ ++ for (iter = (start); \ ++ (iter).bi_size && \ ++ ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ ++ bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) ++ ++/** ++ * bio_for_each_folio - iterate over folios within a bio ++ * ++ * Like other non-_all versions, this iterates over what bio->bi_iter currently ++ * points to. This version is for drivers, where the bio may have previously ++ * been split or cloned. ++ */ ++#define bio_for_each_folio(bvl, bio, iter) \ ++ __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) ++ ++struct quota_res { ++ u64 sectors; ++}; ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++static inline void __bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++ BUG_ON(res->sectors > inode->ei_quota_reserved); ++ ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, ++ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); ++ inode->ei_quota_reserved -= res->sectors; ++ res->sectors = 0; ++} ++ ++static inline void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++ if (res->sectors) { ++ mutex_lock(&inode->ei_quota_lock); ++ __bch2_quota_reservation_put(c, inode, res); ++ mutex_unlock(&inode->ei_quota_lock); ++ } ++} ++ ++static inline int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ u64 sectors, ++ bool check_enospc) ++{ ++ int ret; ++ ++ if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) ++ return 0; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, ++ check_enospc ? 
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); ++ if (likely(!ret)) { ++ inode->ei_quota_reserved += sectors; ++ res->sectors += sectors; ++ } ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++#else ++ ++static inline void __bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) {} ++ ++static inline void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) {} ++ ++static inline int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ return 0; ++} ++ ++#endif ++ ++void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *, ++ struct quota_res *, s64); ++ ++static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, ++ struct quota_res *quota_res, s64 sectors) ++{ ++ if (sectors) { ++ mutex_lock(&inode->ei_quota_lock); ++ __bch2_i_sectors_acct(c, inode, quota_res, sectors); ++ mutex_unlock(&inode->ei_quota_lock); ++ } ++} ++ ++static inline struct address_space *faults_disabled_mapping(void) ++{ ++ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); ++} ++ ++static inline void set_fdm_dropped_locks(void) ++{ ++ current->faults_disabled_mapping = ++ (void *) (((unsigned long) current->faults_disabled_mapping)|1); ++} ++ ++static inline bool fdm_dropped_locks(void) ++{ ++ return ((unsigned long) current->faults_disabled_mapping) & 1; ++} ++ ++void bch2_inode_flush_nocow_writes_async(struct bch_fs *, ++ struct bch_inode_info *, struct closure *); + +int __must_check bch2_write_inode_size(struct bch_fs *, + struct bch_inode_info *, + loff_t, unsigned); + -+int bch2_read_folio(struct file *, struct folio *); -+ -+int bch2_writepages(struct address_space *, struct writeback_control *); -+void bch2_readahead(struct readahead_control *); -+ -+int bch2_write_begin(struct file *, struct address_space *, loff_t, -+ unsigned, struct page **, void **); -+int bch2_write_end(struct file *, struct address_space *, loff_t, -+ unsigned, unsigned, struct page *, void *); -+ -+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); -+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); -+ +int bch2_fsync(struct file *, loff_t, loff_t, int); + +int bch2_truncate(struct mnt_idmap *, @@ -54920,11 +55598,6 @@ index 000000000..af9053315 + +loff_t bch2_llseek(struct file *, loff_t, int); + -+vm_fault_t bch2_page_fault(struct vm_fault *); -+vm_fault_t bch2_page_mkwrite(struct vm_fault *); -+void bch2_invalidate_folio(struct folio *, size_t, size_t); -+bool bch2_release_folio(struct folio *, gfp_t); -+ +void bch2_fs_fsio_exit(struct bch_fs *); +int bch2_fs_fsio_init(struct bch_fs *); +#else @@ -54935,10 +55608,10 @@ index 000000000..af9053315 +#endif /* _BCACHEFS_FS_IO_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c new file mode 100644 -index 000000000..dfa1bf73c +index 000000000..141bcced0 --- /dev/null +++ b/fs/bcachefs/fs-ioctl.c -@@ -0,0 +1,556 @@ +@@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -54972,7 +55645,8 @@ index 000000000..dfa1bf73c + bool projinherit; +}; + -+static int bch2_inode_flags_set(struct bch_inode_info *inode, ++static int bch2_inode_flags_set(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ @@ -55065,7 +55739,8 @@ index 000000000..dfa1bf73c + return copy_to_user(arg, &fa, sizeof(fa)); +} + 
-+static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, ++static int fssetxattr_inode_update_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ @@ -55076,7 +55751,7 @@ index 000000000..dfa1bf73c + bi->bi_project = s->projid; + } + -+ return bch2_inode_flags_set(inode, bi, p); ++ return bch2_inode_flags_set(trans, inode, bi, p); +} + +static int bch2_ioc_fssetxattr(struct bch_fs *c, @@ -55133,7 +55808,8 @@ index 000000000..dfa1bf73c + return ret; +} + -+static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, ++static int bch2_reinherit_attrs_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ @@ -55584,10 +56260,10 @@ index 000000000..f201980ef +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 -index 000000000..8d2f388b4 +index 000000000..80dcda43e --- /dev/null +++ b/fs/bcachefs/fs.c -@@ -0,0 +1,1943 @@ +@@ -0,0 +1,1961 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -55604,12 +56280,16 @@ index 000000000..8d2f388b4 +#include "fs-common.h" +#include "fs-io.h" +#include "fs-ioctl.h" ++#include "fs-io-buffered.h" ++#include "fs-io-direct.h" ++#include "fs-io-pagecache.h" +#include "fsck.h" +#include "inode.h" +#include "io.h" +#include "journal.h" +#include "keylist.h" +#include "quota.h" ++#include "snapshot.h" +#include "super.h" +#include "xattr.h" + @@ -55679,7 +56359,7 @@ index 000000000..8d2f388b4 + + ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT) ?: -+ (set ? set(inode, &inode_u, p) : 0) ?: ++ (set ? set(&trans, inode, &inode_u, p) : 0) ?: + bch2_inode_write(&trans, &iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); + @@ -55793,7 +56473,7 @@ index 000000000..8d2f388b4 + + if (ret) { + iget_failed(&inode->v); -+ return ERR_PTR(ret); ++ return ERR_PTR(bch2_err_class(ret)); + } + + mutex_lock(&c->vfs_inodes_lock); @@ -56590,11 +57270,16 @@ index 000000000..8d2f388b4 +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ int ret; + + if (!dir_emit_dots(file, ctx)) + return 0; + -+ return bch2_readdir(c, inode_inum(inode), ctx); ++ ret = bch2_readdir(c, inode_inum(inode), ctx); ++ if (ret) ++ bch_err_fn(c, ret); ++ ++ return bch2_err_class(ret); +} + +static const struct file_operations bch_file_operations = { @@ -56819,7 +57504,8 @@ index 000000000..8d2f388b4 + struct bch_inode_unpacked inode_u; + subvol_inum target; + u32 snapshot; -+ unsigned name_len; ++ struct qstr dirent_name; ++ unsigned name_len = 0; + int ret; + + if (!S_ISDIR(dir->v.i_mode)) @@ -56896,9 +57582,10 @@ index 000000000..8d2f388b4 + ret = -ENOENT; + goto err; +found: -+ name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX); ++ dirent_name = bch2_dirent_get_name(d); + -+ memcpy(name, d.v->d_name, name_len); ++ name_len = min_t(unsigned, dirent_name.len, NAME_MAX); ++ memcpy(name, dirent_name.name, name_len); + name[name_len] = '\0'; +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -56996,7 +57683,8 @@ index 000000000..8d2f388b4 + call_rcu(&vinode->i_rcu, bch2_i_callback); +} + -+static int inode_update_times_fn(struct bch_inode_info *inode, ++static int inode_update_times_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ @@ -57484,7 +58172,10 @@ index 000000000..8d2f388b4 + return dget(sb->s_root); 
+ +err_put_super: ++ sb->s_fs_info = NULL; ++ c->vfs_sb = NULL; + deactivate_locked_super(sb); ++ bch2_fs_stop(c); + return ERR_PTR(bch2_err_class(ret)); +} + @@ -57492,8 +58183,11 @@ index 000000000..8d2f388b4 +{ + struct bch_fs *c = sb->s_fs_info; + ++ if (c) ++ c->vfs_sb = NULL; + generic_shutdown_super(sb); -+ bch2_fs_free(c); ++ if (c) ++ bch2_fs_free(c); +} + +static struct file_system_type bcache_fs_type = { @@ -57533,10 +58227,10 @@ index 000000000..8d2f388b4 +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h new file mode 100644 -index 000000000..6170d214d +index 000000000..10e11119d --- /dev/null +++ b/fs/bcachefs/fs.h -@@ -0,0 +1,208 @@ +@@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_H +#define _BCACHEFS_FS_H @@ -57713,7 +58407,8 @@ index 000000000..6170d214d +struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); + +/* returns 0 if we want to do the update, or error is passed up */ -+typedef int (*inode_set_fn)(struct bch_inode_info *, ++typedef int (*inode_set_fn)(struct btree_trans *, ++ struct bch_inode_info *, + struct bch_inode_unpacked *, void *); + +void bch2_inode_update_after_write(struct btree_trans *, @@ -57747,10 +58442,10 @@ index 000000000..6170d214d +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 -index 000000000..d3eb3dc1c +index 000000000..238caeeaf --- /dev/null +++ b/fs/bcachefs/fsck.c -@@ -0,0 +1,2471 @@ +@@ -0,0 +1,2483 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -57764,7 +58459,8 @@ index 000000000..d3eb3dc1c +#include "fsck.h" +#include "inode.h" +#include "keylist.h" -+#include "subvolume.h" ++#include "recovery.h" ++#include "snapshot.h" +#include "super.h" +#include "xattr.h" + @@ -57972,69 +58668,6 @@ index 000000000..d3eb3dc1c + return ret; +} + -+static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter = { NULL }; -+ struct bkey_i_inode_generation delete; -+ struct bch_inode_unpacked inode_u; -+ struct bkey_s_c k; -+ int ret; -+ -+ do { -+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, -+ SPOS(inum, 0, snapshot), -+ SPOS(inum, U64_MAX, snapshot), -+ 0, NULL) ?: -+ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, -+ SPOS(inum, 0, snapshot), -+ SPOS(inum, U64_MAX, snapshot), -+ 0, NULL) ?: -+ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, -+ SPOS(inum, 0, snapshot), -+ SPOS(inum, U64_MAX, snapshot), -+ 0, NULL); -+ } while (ret == -BCH_ERR_transaction_restart_nested); -+ if (ret) -+ goto err; -+retry: -+ bch2_trans_begin(trans); -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, -+ SPOS(0, inum, snapshot), BTREE_ITER_INTENT); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (!bkey_is_inode(k.k)) { -+ bch2_fs_inconsistent(c, -+ "inode %llu:%u not found when deleting", -+ inum, snapshot); -+ ret = -EIO; -+ goto err; -+ } -+ -+ bch2_inode_unpack(k, &inode_u); -+ -+ /* Subvolume root? 
*/ -+ if (inode_u.bi_subvol) -+ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); -+ -+ bkey_inode_generation_init(&delete.k_i); -+ delete.k.p = iter.pos; -+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); -+ -+ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ return ret ?: -BCH_ERR_transaction_restart_nested; -+} -+ +static int __remove_dirent(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; @@ -58224,6 +58857,28 @@ index 000000000..d3eb3dc1c + memset(s, 0, sizeof(*s)); +} + ++static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) ++{ ++ struct snapshots_seen_entry *i, n = { ++ .id = id, ++ .equiv = bch2_snapshot_equiv(c, id), ++ }; ++ int ret = 0; ++ ++ darray_for_each(s->ids, i) { ++ if (i->id == id) ++ return 0; ++ if (i->id > id) ++ break; ++ } ++ ++ ret = darray_insert_item(&s->ids, i - s->ids.data, n); ++ if (ret) ++ bch_err(c, "error reallocating snapshots_seen table (size %zu)", ++ s->ids.size); ++ return ret; ++} ++ +static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, + enum btree_id btree_id, struct bpos pos) +{ @@ -58646,14 +59301,6 @@ index 000000000..d3eb3dc1c + if (ret) + goto err; + -+ /* -+ * if snapshot id isn't a leaf node, skip it - deletion in -+ * particular is not atomic, so on the internal snapshot nodes -+ * we can see inodes marked for deletion after a clean shutdown -+ */ -+ if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot)) -+ return 0; -+ + if (!bkey_is_inode(k.k)) + return 0; + @@ -58675,6 +59322,27 @@ index 000000000..d3eb3dc1c + return -EINVAL; + } + ++ if ((u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED)) && ++ bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { ++ struct bpos new_min_pos; ++ ++ ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); ++ if (ret) ++ goto err; ++ ++ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED; ++ ++ ret = __write_inode(trans, &u, iter->pos.snapshot); ++ if (ret) { ++ bch_err_msg(c, ret, "in fsck: error updating inode"); ++ return ret; ++ } ++ ++ if (!bpos_eq(new_min_pos, POS_MIN)) ++ bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); ++ return 0; ++ } ++ + if (u.bi_flags & BCH_INODE_UNLINKED && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu unlinked", @@ -58682,7 +59350,7 @@ index 000000000..d3eb3dc1c + bch2_trans_unlock(trans); + bch2_fs_lazy_rw(c); + -+ ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); ++ ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error in fsck: error while deleting inode: %s", + bch2_err_str(ret)); @@ -58753,9 +59421,10 @@ index 000000000..d3eb3dc1c + + if (do_update) { + ret = __write_inode(trans, &u, iter->pos.snapshot); -+ if (ret) -+ bch_err(c, "error in fsck: error updating inode: %s", -+ bch2_err_str(ret)); ++ if (ret) { ++ bch_err_msg(c, ret, "in fsck: error updating inode"); ++ return ret; ++ } + } +err: +fsck_err: @@ -58938,74 +59607,116 @@ index 000000000..d3eb3dc1c + +static int overlapping_extents_found(struct btree_trans *trans, + enum btree_id btree, -+ struct bpos pos1, struct bkey pos2, -+ bool *fixed) ++ struct bpos 
pos1, struct snapshots_seen *pos1_seen, ++ struct bkey pos2, ++ bool *fixed, ++ struct extent_end *extent_end) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ u32 snapshot = min(pos1.snapshot, pos2.p.snapshot); ++ struct btree_iter iter1, iter2 = { NULL }; ++ struct bkey_s_c k1, k2; + int ret; + + BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); + -+ bch2_trans_iter_init(trans, &iter, btree, SPOS(pos1.inode, pos1.offset - 1, snapshot), 0); -+ k = bch2_btree_iter_peek_upto(&iter, POS(pos1.inode, U64_MAX)); -+ ret = bkey_err(k); ++ bch2_trans_iter_init(trans, &iter1, btree, pos1, ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_NOT_EXTENTS); ++ k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); ++ ret = bkey_err(k1); + if (ret) + goto err; + + prt_str(&buf, "\n "); -+ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_bkey_val_to_text(&buf, c, k1); + -+ if (!bpos_eq(pos1, k.k->p)) { -+ bch_err(c, "%s: error finding first overlapping extent when repairing%s", ++ if (!bpos_eq(pos1, k1.k->p)) { ++ prt_str(&buf, "\n wanted\n "); ++ bch2_bpos_to_text(&buf, pos1); ++ prt_str(&buf, "\n "); ++ bch2_bkey_to_text(&buf, &pos2); ++ ++ bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", + __func__, buf.buf); + ret = -BCH_ERR_internal_fsck_err; + goto err; + } + -+ while (1) { -+ bch2_btree_iter_advance(&iter); ++ bch2_trans_copy_iter(&iter2, &iter1); + -+ k = bch2_btree_iter_peek_upto(&iter, POS(pos1.inode, U64_MAX)); -+ ret = bkey_err(k); ++ while (1) { ++ bch2_btree_iter_advance(&iter2); ++ ++ k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); ++ ret = bkey_err(k2); + if (ret) + goto err; + -+ if (bkey_ge(k.k->p, pos2.p)) ++ if (bpos_ge(k2.k->p, pos2.p)) + break; -+ + } + + prt_str(&buf, "\n "); -+ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_bkey_val_to_text(&buf, c, k2); + -+ if (bkey_gt(k.k->p, pos2.p) || -+ pos2.size != k.k->size) { ++ if (bpos_gt(k2.k->p, pos2.p) || ++ pos2.size != k2.k->size) { + bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", + __func__, buf.buf); + ret = -BCH_ERR_internal_fsck_err; + goto err; + } + -+ if (fsck_err(c, "overlapping extents%s", buf.buf)) { -+ struct bpos update_pos = pos1.snapshot < pos2.p.snapshot ? pos1 : pos2.p; -+ struct btree_iter update_iter; ++ prt_printf(&buf, "\n overwriting %s extent", ++ pos1.snapshot >= pos2.p.snapshot ? 
"first" : "second"); + -+ struct bkey_i *update = bch2_bkey_get_mut(trans, &update_iter, -+ btree, update_pos, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ bch2_trans_iter_exit(trans, &update_iter); -+ if ((ret = PTR_ERR_OR_ZERO(update))) ++ if (fsck_err(c, "overlapping extents%s", buf.buf)) { ++ struct btree_iter *old_iter = &iter1; ++ struct disk_reservation res = { 0 }; ++ ++ if (pos1.snapshot < pos2.p.snapshot) { ++ old_iter = &iter2; ++ swap(k1, k2); ++ } ++ ++ trans->extra_journal_res += bch2_bkey_sectors_compressed(k2); ++ ++ ret = bch2_trans_update_extent_overwrite(trans, old_iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, ++ k1, k2) ?: ++ bch2_trans_commit(trans, &res, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL); ++ bch2_disk_reservation_put(c, &res); ++ ++ if (ret) + goto err; + + *fixed = true; ++ ++ if (pos1.snapshot == pos2.p.snapshot) { ++ /* ++ * We overwrote the first extent, and did the overwrite ++ * in the same snapshot: ++ */ ++ extent_end->offset = bkey_start_offset(&pos2); ++ } else if (pos1.snapshot > pos2.p.snapshot) { ++ /* ++ * We overwrote the first extent in pos2's snapshot: ++ */ ++ ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); ++ } else { ++ /* ++ * We overwrote the second extent - restart ++ * check_extent() from the top: ++ */ ++ ret = -BCH_ERR_transaction_restart_nested; ++ } + } +fsck_err: +err: -+ bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(trans, &iter2); ++ bch2_trans_iter_exit(trans, &iter1); + printbuf_exit(&buf); + return ret; +} @@ -59015,11 +59726,11 @@ index 000000000..d3eb3dc1c + struct extent_ends *extent_ends, + struct bkey_s_c k, + u32 equiv, -+ struct btree_iter *iter) ++ struct btree_iter *iter, ++ bool *fixed) +{ + struct bch_fs *c = trans->c; + struct extent_end *i; -+ bool fixed = false; + int ret = 0; + + /* transaction restart, running again */ @@ -59042,7 +59753,8 @@ index 000000000..d3eb3dc1c + SPOS(iter->pos.inode, + i->offset, + i->snapshot), -+ *k.k, &fixed); ++ &i->seen, ++ *k.k, fixed, i); + if (ret) + goto err; + } @@ -59053,7 +59765,7 @@ index 000000000..d3eb3dc1c + + extent_ends->last_pos = k.k->p; +err: -+ return ret ?: fixed; ++ return ret; +} + +static int check_extent(struct btree_trans *trans, struct btree_iter *iter, @@ -59108,13 +59820,10 @@ index 000000000..d3eb3dc1c + goto delete; + + ret = check_overlapping_extents(trans, s, extent_ends, k, -+ equiv.snapshot, iter); -+ if (ret < 0) -+ goto err; -+ ++ equiv.snapshot, iter, ++ &inode->recalculate_sums); + if (ret) -+ inode->recalculate_sums = true; -+ ret = 0; ++ goto err; + } + + /* @@ -59189,7 +59898,7 @@ index 000000000..d3eb3dc1c + + snapshots_seen_init(&s); + extent_ends_init(&extent_ends); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), @@ -59894,8 +60603,6 @@ index 000000000..d3eb3dc1c + return ret; +} + -+/* check_nlink pass: */ -+ +struct nlink_table { + size_t nr; + size_t size; @@ -60244,14 +60951,15 @@ index 000000000..90c87b508 +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 -index 000000000..8834809d4 +index 000000000..8114b6e4f --- /dev/null +++ b/fs/bcachefs/inode.c -@@ -0,0 +1,925 @@ +@@ -0,0 +1,1111 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" ++#include "btree_write_buffer.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "buckets.h" @@ 
-60260,6 +60968,7 @@ index 000000000..8834809d4 +#include "extent_update.h" +#include "inode.h" +#include "str_hash.h" ++#include "snapshot.h" +#include "subvolume.h" +#include "varint.h" + @@ -60597,6 +61306,8 @@ index 000000000..8834809d4 + return 0; +err: + bch2_trans_iter_exit(trans, iter); ++ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); + return ret; +} + @@ -60769,6 +61480,25 @@ index 000000000..8834809d4 + __bch2_inode_unpacked_to_text(out, &inode); +} + ++static inline u64 bkey_inode_flags(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); ++ case KEY_TYPE_inode_v2: ++ return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); ++ case KEY_TYPE_inode_v3: ++ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); ++ default: ++ return 0; ++ } ++} ++ ++static inline bool bkey_is_deleted_inode(struct bkey_s_c k) ++{ ++ return bkey_inode_flags(k) & BCH_INODE_UNLINKED; ++} ++ +int bch2_trans_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, @@ -60776,6 +61506,8 @@ index 000000000..8834809d4 + unsigned flags) +{ + int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); ++ bool old_deleted = bkey_is_deleted_inode(old); ++ bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new)); + + if (nr) { + int ret = bch2_replicas_deltas_realloc(trans, 0); @@ -60787,6 +61519,12 @@ index 000000000..8834809d4 + d->nr_inodes += nr; + } + ++ if (old_deleted != new_deleted) { ++ int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted); ++ if (ret) ++ return ret; ++ } ++ + return 0; +} + @@ -61173,12 +61911,167 @@ index 000000000..8834809d4 + if (opts->nocow) + opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; +} ++ ++int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter = { NULL }; ++ struct bkey_i_inode_generation delete; ++ struct bch_inode_unpacked inode_u; ++ struct bkey_s_c k; ++ int ret; ++ ++ do { ++ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL) ?: ++ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL) ?: ++ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL); ++ } while (ret == -BCH_ERR_transaction_restart_nested); ++ if (ret) ++ goto err; ++retry: ++ bch2_trans_begin(trans); ++ ++ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, inum, snapshot), BTREE_ITER_INTENT); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!bkey_is_inode(k.k)) { ++ bch2_fs_inconsistent(c, ++ "inode %llu:%u not found when deleting", ++ inum, snapshot); ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_inode_unpack(k, &inode_u); ++ ++ /* Subvolume root? 
*/ ++ if (inode_u.bi_subvol) ++ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); ++ ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p = iter.pos; ++ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); ++ ++ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ return ret ?: -BCH_ERR_transaction_restart_nested; ++} ++ ++static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_inode_unpacked inode; ++ int ret; ++ ++ if (bch2_snapshot_is_internal_node(c, pos.snapshot)) ++ return 0; ++ ++ if (!fsck_err_on(c->sb.clean, c, ++ "filesystem marked as clean but have deleted inode %llu:%u", ++ pos.offset, pos.snapshot)) ++ return 0; ++ ++ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; ++ if (fsck_err_on(!bkey_is_inode(k.k), c, ++ "nonexistent inode %llu:%u in deleted_inodes btree", ++ pos.offset, pos.snapshot)) ++ goto delete; ++ ++ ret = bch2_inode_unpack(k, &inode); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(S_ISDIR(inode.bi_mode), c, ++ "directory %llu:%u in deleted_inodes btree", ++ pos.offset, pos.snapshot)) ++ goto delete; ++ ++ if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c, ++ "non-deleted inode %llu:%u in deleted_inodes btree", ++ pos.offset, pos.snapshot)) ++ goto delete; ++ ++ return 1; ++err: ++fsck_err: ++ return ret; ++delete: ++ return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); ++} ++ ++int bch2_delete_dead_inodes(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = bch2_btree_write_buffer_flush_sync(&trans); ++ if (ret) ++ goto err; ++ ++ /* ++ * Weird transaction restart handling here because on successful delete, ++ * bch2_inode_rm_snapshot() will return a nested transaction restart, ++ * but we can't retry because the btree write buffer won't have been ++ * flushed and we'd spin: ++ */ ++ for_each_btree_key(&trans, iter, BTREE_ID_deleted_inodes, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ ret = lockrestart_do(&trans, may_delete_deleted_inode(&trans, k.k->p)); ++ if (ret < 0) ++ break; ++ ++ if (ret) { ++ if (!test_bit(BCH_FS_RW, &c->flags)) { ++ bch2_trans_unlock(&trans); ++ bch2_fs_lazy_rw(c); ++ } ++ ++ ret = bch2_inode_rm_snapshot(&trans, k.k->p.offset, k.k->p.snapshot); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ break; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new file mode 100644 -index 000000000..7809d1b6d +index 000000000..22b244056 --- /dev/null +++ b/fs/bcachefs/inode.h -@@ -0,0 +1,201 @@ +@@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_H +#define _BCACHEFS_INODE_H @@ -61379,13 +62272,16 @@ index 000000000..7809d1b6d +void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, + struct bch_inode_unpacked *); + ++int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); ++int 
bch2_delete_dead_inodes(struct bch_fs *); ++ +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c new file mode 100644 -index 000000000..5bacc6a9d +index 000000000..3c614c864 --- /dev/null +++ b/fs/bcachefs/io.c -@@ -0,0 +1,3059 @@ +@@ -0,0 +1,3051 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations @@ -61768,10 +62664,10 @@ index 000000000..5bacc6a9d + struct bch_fs *c = trans->c; + struct disk_reservation disk_res = { 0 }; + struct closure cl; -+ struct open_buckets open_buckets; ++ struct open_buckets open_buckets = { 0 }; + struct bkey_s_c k; + struct bkey_buf old, new; -+ unsigned sectors_allocated; ++ unsigned sectors_allocated = 0; + bool have_reservation = false; + bool unwritten = opts.nocow && + c->sb.version >= bcachefs_metadata_version_unwritten_extents; @@ -61780,9 +62676,6 @@ index 000000000..5bacc6a9d + bch2_bkey_buf_init(&old); + bch2_bkey_buf_init(&new); + closure_init_stack(&cl); -+ open_buckets.nr = 0; -+retry: -+ sectors_allocated = 0; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); @@ -61801,14 +62694,14 @@ index 000000000..5bacc6a9d + */ + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) -+ goto out; ++ goto err; + + bch2_bkey_buf_reassemble(&old, c, k); + } + + if (have_reservation) { + if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) -+ goto out; ++ goto err; + + bch2_key_resize(&new.k->k, sectors); + } else if (!unwritten) { @@ -61840,13 +62733,10 @@ index 000000000..5bacc6a9d + opts.data_replicas, + opts.data_replicas, + BCH_WATERMARK_normal, 0, &cl, &wp); -+ if (ret) { -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) -+ goto retry; -+ return ret; -+ } ++ if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) ++ ret = -BCH_ERR_transaction_restart_nested; ++ if (ret) ++ goto err; + + sectors = min(sectors, wp->sectors_free); + sectors_allocated = sectors; @@ -61865,17 +62755,7 @@ index 000000000..5bacc6a9d + + ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, + 0, i_sectors_delta, true); -+out: -+ if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ } -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { -+ bch2_trans_begin(trans); -+ goto retry; -+ } -+ ++err: + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); + @@ -61884,6 +62764,11 @@ index 000000000..5bacc6a9d + bch2_bkey_buf_exit(&new, c); + bch2_bkey_buf_exit(&old, c); + ++ if (closure_nr_remaining(&cl) != 1) { ++ bch2_trans_unlock(trans); ++ closure_sync(&cl); ++ } ++ + return ret; +} + @@ -62098,13 +62983,15 @@ index 000000000..5bacc6a9d + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + ++ EBUG_ON(op->open_buckets.nr); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + bch2_disk_reservation_put(c, &op->res); ++ + if (!(op->flags & BCH_WRITE_MOVE)) + bch2_write_ref_put(c, BCH_WRITE_REF_write); + bch2_keylist_free(&op->insert_keys, op->inline_keys); + -+ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); -+ + EBUG_ON(cl->parent); + closure_debug_destroy(cl); + if (op->end_io) @@ -63823,6 +64710,7 @@ index 000000000..5bacc6a9d + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; ++ + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } @@ -64447,7 +65335,7 @@ 
index 000000000..5bacc6a9d +} diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h new file mode 100644 -index 000000000..1476380d5 +index 000000000..831e3f1b7 --- /dev/null +++ b/fs/bcachefs/io.h @@ -0,0 +1,202 @@ @@ -64505,7 +65393,7 @@ index 000000000..1476380d5 +}; + +enum bch_write_flags { -+#define x(f) BCH_WRITE_##f = 1U << __BCH_WRITE_##f, ++#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), + BCH_WRITE_FLAGS() +#undef x +}; @@ -64826,7 +65714,7 @@ index 000000000..737f16d78 +#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 -index 000000000..80a612c05 +index 000000000..055920c26 --- /dev/null +++ b/fs/bcachefs/journal.c @@ -0,0 +1,1438 @@ @@ -64895,6 +65783,7 @@ index 000000000..80a612c05 +static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) +{ + unsigned i; ++ + for (i = 0; i < ARRAY_SIZE(p->list); i++) + INIT_LIST_HEAD(&p->list[i]); + INIT_LIST_HEAD(&p->flushed); @@ -65346,8 +66235,7 @@ index 000000000..80a612c05 + int ret; + + closure_wait_event(&j->async_wait, -+ (ret = __journal_res_get(j, res, flags)) != -+ -BCH_ERR_journal_res_get_blocked|| ++ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK)); + return ret; +} @@ -66802,10 +67690,10 @@ index 000000000..008a2e25a +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 -index 000000000..f861ae2f1 +index 000000000..34740dca4 --- /dev/null +++ b/fs/bcachefs/journal_io.c -@@ -0,0 +1,1863 @@ +@@ -0,0 +1,1888 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -66822,6 +67710,7 @@ index 000000000..f861ae2f1 +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "replicas.h" ++#include "sb-clean.h" +#include "trace.h" + +static struct nonce journal_nonce(const struct jset *jset) @@ -67016,33 +67905,41 @@ index 000000000..f861ae2f1 +#define JOURNAL_ENTRY_BAD 7 + +static void journal_entry_err_msg(struct printbuf *out, ++ u32 version, + struct jset *jset, + struct jset_entry *entry) +{ -+ prt_str(out, "invalid journal entry "); -+ if (entry) -+ prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]); ++ prt_str(out, "invalid journal entry, version="); ++ bch2_version_to_text(out, version); ++ ++ if (entry) { ++ prt_str(out, " type="); ++ prt_str(out, bch2_jset_entry_types[entry->type]); ++ } ++ ++ if (!jset) { ++ prt_printf(out, " in superblock"); ++ } else { ++ ++ prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); ++ ++ if (entry) ++ prt_printf(out, " offset=%zi/%u", ++ (u64 *) entry - jset->_data, ++ le32_to_cpu(jset->u64s)); ++ } + -+ if (!jset) -+ prt_printf(out, "in superblock"); -+ else if (!entry) -+ prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq)); -+ else -+ prt_printf(out, "at offset %zi/%u seq %llu", -+ (u64 *) entry - jset->_data, -+ le32_to_cpu(jset->u64s), -+ le64_to_cpu(jset->seq)); + prt_str(out, ": "); +} + -+#define journal_entry_err(c, jset, entry, msg, ...) \ ++#define journal_entry_err(c, version, jset, entry, msg, ...) 
\ +({ \ + struct printbuf buf = PRINTBUF; \ + \ -+ journal_entry_err_msg(&buf, jset, entry); \ ++ journal_entry_err_msg(&buf, version, jset, entry); \ + prt_printf(&buf, msg, ##__VA_ARGS__); \ + \ -+ switch (write) { \ ++ switch (flags & BKEY_INVALID_WRITE) { \ + case READ: \ + mustfix_fsck_err(c, "%s", buf.buf); \ + break; \ @@ -67059,8 +67956,8 @@ index 000000000..f861ae2f1 + true; \ +}) + -+#define journal_entry_err_on(cond, c, jset, entry, msg, ...) \ -+ ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false) ++#define journal_entry_err_on(cond, c, version, jset, entry, msg, ...) \ ++ ((cond) ? journal_entry_err(c, version, jset, entry, msg, ##__VA_ARGS__) : false) + +#define FSCK_DELETED_KEY 5 + @@ -67069,13 +67966,15 @@ index 000000000..f861ae2f1 + struct jset_entry *entry, + unsigned level, enum btree_id btree_id, + struct bkey_i *k, -+ unsigned version, int big_endian, int write) ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ ++ int write = flags & BKEY_INVALID_WRITE; + void *next = vstruct_next(entry); + struct printbuf buf = PRINTBUF; + int ret = 0; + -+ if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) { ++ if (journal_entry_err_on(!k->k.u64s, c, version, jset, entry, "k->u64s 0")) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return FSCK_DELETED_KEY; @@ -67083,7 +67982,7 @@ index 000000000..f861ae2f1 + + if (journal_entry_err_on((void *) bkey_next(k) > + (void *) vstruct_next(entry), -+ c, jset, entry, ++ c, version, jset, entry, + "extends past end of journal entry")) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); @@ -67091,7 +67990,7 @@ index 000000000..f861ae2f1 + } + + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, -+ c, jset, entry, ++ c, version, jset, entry, + "bad format %u", k->k.format)) { + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -67106,11 +68005,7 @@ index 000000000..f861ae2f1 + if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), write, &buf)) { + printbuf_reset(&buf); -+ prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:", -+ bch2_jset_entry_types[entry->type], -+ (u64 *) entry - jset->_data, -+ le32_to_cpu(jset->u64s), -+ le64_to_cpu(jset->seq)); ++ journal_entry_err_msg(&buf, version, jset, entry); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + @@ -67138,9 +68033,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_btree_keys_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct bkey_i *k = entry->start; + @@ -67149,7 +68045,7 @@ index 000000000..f861ae2f1 + entry->level, + entry->btree_id, + k, version, big_endian, -+ write|BKEY_INVALID_JOURNAL); ++ flags|BKEY_INVALID_JOURNAL); + if (ret == FSCK_DELETED_KEY) + continue; + @@ -67177,16 +68073,17 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_btree_root_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct bkey_i *k = entry->start; + int ret = 0; + + if 
(journal_entry_err_on(!entry->u64s || + le16_to_cpu(entry->u64s) != k->k.u64s, -+ c, jset, entry, ++ c, version, jset, entry, + "invalid btree root journal entry: wrong number of keys")) { + void *next = vstruct_next(entry); + /* @@ -67200,7 +68097,7 @@ index 000000000..f861ae2f1 + } + + return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, -+ version, big_endian, write); ++ version, big_endian, flags); +fsck_err: + return ret; +} @@ -67212,9 +68109,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_prio_ptrs_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + /* obsolete, don't care: */ + return 0; @@ -67226,14 +68124,15 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_blacklist_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, -+ c, jset, entry, ++ c, version, jset, entry, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } @@ -67251,15 +68150,16 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_blacklist_v2_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct jset_entry_blacklist_v2 *bl_entry; + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, -+ c, jset, entry, ++ c, version, jset, entry, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + goto out; @@ -67269,7 +68169,7 @@ index 000000000..f861ae2f1 + + if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > + le64_to_cpu(bl_entry->end), -+ c, jset, entry, ++ c, version, jset, entry, + "invalid journal seq blacklist entry: start > end")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } @@ -67290,9 +68190,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_usage_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); @@ -67300,7 +68201,7 @@ index 000000000..f861ae2f1 + int ret = 0; + + if (journal_entry_err_on(bytes < sizeof(*u), -+ c, jset, entry, ++ c, version, jset, entry, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; @@ -67322,9 +68223,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_data_usage_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); @@ -67333,7 +68235,7 @@ index 000000000..f861ae2f1 + 
+ if (journal_entry_err_on(bytes < sizeof(*u) || + bytes < sizeof(*u) + u->r.nr_devs, -+ c, jset, entry, ++ c, version, jset, entry, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; @@ -67354,9 +68256,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_clock_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); @@ -67364,13 +68267,13 @@ index 000000000..f861ae2f1 + int ret = 0; + + if (journal_entry_err_on(bytes != sizeof(*clock), -+ c, jset, entry, "bad size")) { ++ c, version, jset, entry, "bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(clock->rw > 1, -+ c, jset, entry, "bad rw")) { ++ c, version, jset, entry, "bad rw")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } @@ -67389,9 +68292,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_dev_usage_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); @@ -67401,7 +68305,7 @@ index 000000000..f861ae2f1 + int ret = 0; + + if (journal_entry_err_on(bytes < expected, -+ c, jset, entry, "bad size (%u < %u)", ++ c, version, jset, entry, "bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; @@ -67410,13 +68314,13 @@ index 000000000..f861ae2f1 + dev = le32_to_cpu(u->dev); + + if (journal_entry_err_on(!bch2_dev_exists2(c, dev), -+ c, jset, entry, "bad dev")) { ++ c, version, jset, entry, "bad dev")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(u->pad, -+ c, jset, entry, "bad pad")) { ++ c, version, jset, entry, "bad pad")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } @@ -67449,9 +68353,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_log_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + return 0; +} @@ -67466,9 +68371,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_overwrite_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, READ); @@ -67482,7 +68388,8 @@ index 000000000..f861ae2f1 + +struct jset_entry_ops { + int (*validate)(struct bch_fs *, struct jset *, -+ struct jset_entry *, unsigned, int, int); ++ struct jset_entry *, unsigned, int, ++ enum bkey_invalid_flags); + void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); +}; + @@ -67499,11 +68406,12 @@ index 000000000..f861ae2f1 +int bch2_journal_entry_validate(struct 
bch_fs *c, + struct jset *jset, + struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + return entry->type < BCH_JSET_ENTRY_NR + ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, -+ version, big_endian, write) ++ version, big_endian, flags) + : 0; +} + @@ -67519,22 +68427,22 @@ index 000000000..f861ae2f1 +} + +static int jset_validate_entries(struct bch_fs *c, struct jset *jset, -+ int write) ++ enum bkey_invalid_flags flags) +{ + struct jset_entry *entry; ++ unsigned version = le32_to_cpu(jset->version); + int ret = 0; + + vstruct_for_each(jset, entry) { -+ if (journal_entry_err_on(vstruct_next(entry) > -+ vstruct_last(jset), c, jset, entry, ++ if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), ++ c, version, jset, entry, + "journal entry extends past end of jset")) { + jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); + break; + } + + ret = bch2_journal_entry_validate(c, jset, entry, -+ le32_to_cpu(jset->version), -+ JSET_BIG_ENDIAN(jset), write); ++ version, JSET_BIG_ENDIAN(jset), flags); + if (ret) + break; + } @@ -67545,7 +68453,7 @@ index 000000000..f861ae2f1 +static int jset_validate(struct bch_fs *c, + struct bch_dev *ca, + struct jset *jset, u64 sector, -+ int write) ++ enum bkey_invalid_flags flags) +{ + unsigned version; + int ret = 0; @@ -67554,7 +68462,8 @@ index 000000000..f861ae2f1 + return JOURNAL_ENTRY_NONE; + + version = le32_to_cpu(jset->version); -+ if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, ++ if (journal_entry_err_on(!bch2_version_compatible(version), ++ c, version, jset, NULL, + "%s sector %llu seq %llu: incompatible journal entry version %u.%u", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), @@ -67565,7 +68474,7 @@ index 000000000..f861ae2f1 + } + + if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), -+ c, jset, NULL, ++ c, version, jset, NULL, + "%s sector %llu seq %llu: journal entry with unknown csum type %llu", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), @@ -67575,7 +68484,7 @@ index 000000000..f861ae2f1 + /* last_seq is ignored when JSET_NO_FLUSH is true */ + if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && + le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), -+ c, jset, NULL, ++ c, version, jset, NULL, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(jset->last_seq), + le64_to_cpu(jset->seq))) { @@ -67583,7 +68492,7 @@ index 000000000..f861ae2f1 + return JOURNAL_ENTRY_BAD; + } + -+ ret = jset_validate_entries(c, jset, write); ++ ret = jset_validate_entries(c, jset, flags); +fsck_err: + return ret; +} @@ -67596,14 +68505,15 @@ index 000000000..f861ae2f1 +{ + size_t bytes = vstruct_bytes(jset); + unsigned version; -+ int write = READ; ++ enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + + version = le32_to_cpu(jset->version); -+ if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, ++ if (journal_entry_err_on(!bch2_version_compatible(version), ++ c, version, jset, NULL, + "%s sector %llu seq %llu: unknown journal entry version %u.%u", + ca ? 
ca->name : c->name, + sector, le64_to_cpu(jset->seq), @@ -67618,7 +68528,7 @@ index 000000000..f861ae2f1 + return JOURNAL_ENTRY_REREAD; + + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, -+ c, jset, NULL, ++ c, version, jset, NULL, + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), bytes)) @@ -67839,6 +68749,7 @@ index 000000000..f861ae2f1 + bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); + for (i = 0; i < 3; i++) { + unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; ++ + bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); + } + ja->sectors_free = 0; @@ -67935,7 +68846,7 @@ index 000000000..f861ae2f1 + * those entries will be blacklisted: + */ + genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { -+ int write = READ; ++ enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; + + i = *_i; + @@ -67957,7 +68868,7 @@ index 000000000..f861ae2f1 + } + + if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), -+ c, &i->j, NULL, ++ c, le32_to_cpu(i->j.version), &i->j, NULL, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq))) @@ -68089,18 +69000,14 @@ index 000000000..f861ae2f1 + + bch2_replicas_entry_sort(&replicas.e); + -+ /* -+ * If we're mounting in degraded mode - if we didn't read all -+ * the devices - this is wrong: -+ */ -+ + printbuf_reset(&buf); + bch2_replicas_entry_to_text(&buf, &replicas.e); + + if (!degraded && -+ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, -+ "superblock not marked as containing replicas %s", -+ buf.buf)) { ++ !bch2_replicas_marked(c, &replicas.e) && ++ (le64_to_cpu(i->j.seq) == *last_seq || ++ fsck_err(c, "superblock not marked as containing replicas for journal entry %llu\n %s", ++ le64_to_cpu(i->j.seq), buf.buf))) { + ret = bch2_mark_replicas(c, &replicas.e); + if (ret) + goto err; @@ -68267,6 +69174,7 @@ index 000000000..f861ae2f1 + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *w = journal_last_unwritten_buf(j); ++ struct bch_replicas_padded replicas; + union journal_res_state old, new; + u64 v, seq; + int err = 0; @@ -68278,7 +69186,13 @@ index 000000000..f861ae2f1 + if (!w->devs_written.nr) { + bch_err(c, "unable to write journal to sufficient devices"); + err = -EIO; ++ } else { ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, ++ w->devs_written); ++ if (bch2_mark_replicas(c, &replicas.e)) ++ err = -EIO; + } ++ + if (err) + bch2_fatal_error(c); + @@ -68415,7 +69329,6 @@ index 000000000..f861ae2f1 + } + + continue_at(cl, journal_write_done, c->io_complete_wq); -+ return; +} + +static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset) @@ -68671,10 +69584,10 @@ index 000000000..f861ae2f1 +} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h new file mode 100644 -index 000000000..8801e9810 +index 000000000..a88d097b1 --- /dev/null +++ b/fs/bcachefs/journal_io.h -@@ -0,0 +1,64 @@ +@@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_IO_H +#define _BCACHEFS_JOURNAL_IO_H @@ -68727,7 +69640,8 @@ index 000000000..8801e9810 + jset_entry_for_each_key(entry, k) + +int bch2_journal_entry_validate(struct bch_fs *, struct jset *, -+ struct jset_entry *, unsigned, int, int); ++ struct jset_entry *, unsigned, int, ++ enum bkey_invalid_flags); +void bch2_journal_entry_to_text(struct printbuf *, struct 
bch_fs *, + struct jset_entry *); + @@ -68741,22 +69655,23 @@ index 000000000..8801e9810 +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c new file mode 100644 -index 000000000..8de83e103 +index 000000000..10e1860da --- /dev/null +++ b/fs/bcachefs/journal_reclaim.c -@@ -0,0 +1,873 @@ +@@ -0,0 +1,874 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_update.h" ++#include "buckets.h" +#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "replicas.h" -+#include "super.h" ++#include "sb-members.h" +#include "trace.h" + +#include @@ -69092,7 +70007,7 @@ index 000000000..8de83e103 + list_del_init(&pin->list); + + /* -+ * Unpinning a journal entry make make journal_next_bucket() succeed, if ++ * Unpinning a journal entry may make journal_next_bucket() succeed, if + * writing a new last_seq will now make another bucket available: + */ + return atomic_dec_and_test(&pin_list->count) && @@ -71278,10 +72193,10 @@ index 000000000..027efaa0d +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c new file mode 100644 -index 000000000..052726739 +index 000000000..fb76a1dac --- /dev/null +++ b/fs/bcachefs/move.c -@@ -0,0 +1,1168 @@ +@@ -0,0 +1,1162 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -72387,46 +73302,40 @@ index 000000000..052726739 + return ret; +} + -+void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bch_move_stats *stats; -+ -+ mutex_lock(&c->data_progress_lock); -+ list_for_each_entry(stats, &c->data_progress_list, list) { -+ prt_printf(out, "%s: data type %s btree_id %s position: ", -+ stats->name, -+ bch2_data_types[stats->data_type], -+ bch2_btree_ids[stats->btree_id]); -+ bch2_bpos_to_text(out, stats->pos); -+ prt_printf(out, "%s", "\n"); -+ } -+ mutex_unlock(&c->data_progress_lock); -+} -+ -+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt) ++static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) +{ ++ struct bch_move_stats *stats = ctxt->stats; + struct moving_io *io; + -+ prt_printf(out, "%ps:", ctxt->fn); ++ prt_printf(out, "%s (%ps):", stats->name, ctxt->fn); ++ prt_newline(out); ++ ++ prt_printf(out, " data type %s btree_id %s position: ", ++ bch2_data_types[stats->data_type], ++ bch2_btree_ids[stats->btree_id]); ++ bch2_bpos_to_text(out, stats->pos); + prt_newline(out); + printbuf_indent_add(out, 2); + -+ prt_printf(out, "reads: %u sectors %u", ++ prt_printf(out, "reads: ios %u/%u sectors %u/%u", + atomic_read(&ctxt->read_ios), -+ atomic_read(&ctxt->read_sectors)); ++ c->opts.move_ios_in_flight, ++ atomic_read(&ctxt->read_sectors), ++ c->opts.move_bytes_in_flight >> 9); + prt_newline(out); + -+ prt_printf(out, "writes: %u sectors %u", ++ prt_printf(out, "writes: ios %u/%u sectors %u/%u", + atomic_read(&ctxt->write_ios), -+ atomic_read(&ctxt->write_sectors)); ++ c->opts.move_ios_in_flight, ++ atomic_read(&ctxt->write_sectors), ++ c->opts.move_bytes_in_flight >> 9); + prt_newline(out); + + printbuf_indent_add(out, 2); + + mutex_lock(&ctxt->lock); -+ list_for_each_entry(io, &ctxt->ios, io_list) { ++ list_for_each_entry(io, &ctxt->ios, io_list) + bch2_write_op_to_text(out, &io->write.op); -+ } + mutex_unlock(&ctxt->lock); + + printbuf_indent_sub(out, 4); @@ -72438,7 +73347,7 @@ index 000000000..052726739 + + 
mutex_lock(&c->moving_context_lock); + list_for_each_entry(ctxt, &c->moving_context_list, list) -+ bch2_moving_ctxt_to_text(out, ctxt); ++ bch2_moving_ctxt_to_text(out, c, ctxt); + mutex_unlock(&c->moving_context_lock); +} + @@ -72452,10 +73361,10 @@ index 000000000..052726739 +} diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h new file mode 100644 -index 000000000..547ee7b72 +index 000000000..c3136abe8 --- /dev/null +++ b/fs/bcachefs/move.h -@@ -0,0 +1,96 @@ +@@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVE_H +#define _BCACHEFS_MOVE_H @@ -72546,7 +73455,6 @@ index 000000000..547ee7b72 + struct bch_ioctl_data); + +void bch2_move_stats_init(struct bch_move_stats *stats, char *name); -+void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *); +void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); + +void bch2_fs_move_init(struct bch_fs *); @@ -72596,10 +73504,10 @@ index 000000000..baf1f8570 +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c new file mode 100644 -index 000000000..5242f20bb +index 000000000..256431a6d --- /dev/null +++ b/fs/bcachefs/movinggc.c -@@ -0,0 +1,421 @@ +@@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Moving/copying garbage collector @@ -72822,8 +73730,10 @@ index 000000000..5242f20bb + + f = move_bucket_in_flight_add(buckets_in_flight, *i); + ret = PTR_ERR_OR_ZERO(f); -+ if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */ ++ if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */ ++ ret = 0; + continue; ++ } + if (ret == -ENOMEM) { /* flush IO, continue later */ + ret = 0; + break; @@ -73251,10 +74161,10 @@ index 000000000..bd12bf677 + diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c new file mode 100644 -index 000000000..9dcc61ee5 +index 000000000..960bb247f --- /dev/null +++ b/fs/bcachefs/opts.c -@@ -0,0 +1,592 @@ +@@ -0,0 +1,599 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include @@ -73267,7 +74177,7 @@ index 000000000..9dcc61ee5 +#include "super-io.h" +#include "util.h" + -+#define x(t, n) [n] = #t, ++#define x(t, n, ...) [n] = #t, + +const char * const bch2_error_actions[] = { + BCH_ERROR_ACTIONS() @@ -73352,8 +74262,8 @@ index 000000000..9dcc61ee5 + +#undef x + -+int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, -+ struct printbuf *err) ++static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, ++ struct printbuf *err) +{ + if (!val) { + *res = FSCK_FIX_yes; @@ -73370,18 +74280,18 @@ index 000000000..9dcc61ee5 + return 0; +} + -+void bch2_opt_fix_errors_to_text(struct printbuf *out, -+ struct bch_fs *c, -+ struct bch_sb *sb, -+ u64 v) ++static void bch2_opt_fix_errors_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_sb *sb, ++ u64 v) +{ + prt_str(out, bch2_fsck_fix_opts[v]); +} + -+static const struct bch_opt_fn bch2_opt_fix_errors = { -+ .parse = bch2_opt_fix_errors_parse, -+ .to_text = bch2_opt_fix_errors_to_text, -+}; ++#define bch2_opt_fix_errors (struct bch_opt_fn) { \ ++ .parse = bch2_opt_fix_errors_parse, \ ++ .to_text = bch2_opt_fix_errors_to_text, \ ++} + +const char * const bch2_d_types[BCH_DT_MAX] = { + [DT_UNKNOWN] = "unknown", @@ -73702,6 +74612,13 @@ index 000000000..9dcc61ee5 + if (!options) + return 0; + ++ /* ++ * sys_fsconfig() is now occasionally providing us with option lists ++ * starting with a comma - weird. 
++ */ ++ if (*options == ',') ++ options++; ++ + copied_opts = kstrdup(options, GFP_KERNEL); + if (!copied_opts) + return -1; @@ -75129,7 +76046,7 @@ index 000000000..2191423d9 +#endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c new file mode 100644 -index 000000000..4f0654ff8 +index 000000000..ca99772ae --- /dev/null +++ b/fs/bcachefs/quota.c @@ -0,0 +1,981 @@ @@ -75140,7 +76057,7 @@ index 000000000..4f0654ff8 +#include "error.h" +#include "inode.h" +#include "quota.h" -+#include "subvolume.h" ++#include "snapshot.h" +#include "super-io.h" + +static const char * const bch2_quota_types[] = { @@ -76245,10 +77162,10 @@ index 000000000..6a136083d +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 -index 000000000..c3d577236 +index 000000000..15ce3ecba --- /dev/null +++ b/fs/bcachefs/rebalance.c -@@ -0,0 +1,364 @@ +@@ -0,0 +1,368 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -76364,6 +77281,10 @@ index 000000000..c3d577236 + unsigned percent_full; + u64 work = dev_work + unknown_dev; + ++ /* avoid divide by 0 */ ++ if (!capacity) ++ return; ++ + if (work < dev_work || work < unknown_dev) + work = U64_MAX; + work = min(work, capacity); @@ -76681,10 +77602,10 @@ index 000000000..7462a92e9 +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 -index 000000000..dcd4f9f41 +index 000000000..30efb3c90 --- /dev/null +++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1670 @@ +@@ -0,0 +1,1057 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -76692,6 +77613,7 @@ index 000000000..dcd4f9f41 +#include "bkey_buf.h" +#include "alloc_background.h" +#include "btree_gc.h" ++#include "btree_journal_iter.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_io.h" @@ -76710,6 +77632,8 @@ index 000000000..dcd4f9f41 +#include "quota.h" +#include "recovery.h" +#include "replicas.h" ++#include "sb-clean.h" ++#include "snapshot.h" +#include "subvolume.h" +#include "super-io.h" + @@ -76744,524 +77668,6 @@ index 000000000..dcd4f9f41 + bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; +} + -+/* iterate over keys read from the journal: */ -+ -+static int __journal_key_cmp(enum btree_id l_btree_id, -+ unsigned l_level, -+ struct bpos l_pos, -+ const struct journal_key *r) -+{ -+ return (cmp_int(l_btree_id, r->btree_id) ?: -+ cmp_int(l_level, r->level) ?: -+ bpos_cmp(l_pos, r->k->k.p)); -+} -+ -+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -+{ -+ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); -+} -+ -+static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) -+{ -+ size_t gap_size = keys->size - keys->nr; -+ -+ if (idx >= keys->gap) -+ idx += gap_size; -+ return idx; -+} -+ -+static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) -+{ -+ return keys->d + idx_to_pos(keys, idx); -+} -+ -+static size_t __bch2_journal_key_search(struct journal_keys *keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ size_t l = 0, r = keys->nr, m; -+ -+ while (l < r) { -+ m = l + ((r - l) >> 1); -+ if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) -+ l = m + 1; -+ else -+ r = m; -+ } -+ -+ BUG_ON(l < keys->nr && -+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); -+ -+ BUG_ON(l && -+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); -+ -+ return l; -+} -+ 
-+static size_t bch2_journal_key_search(struct journal_keys *keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); -+} -+ -+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, -+ unsigned level, struct bpos pos, -+ struct bpos end_pos, size_t *idx) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ unsigned iters = 0; -+ struct journal_key *k; -+search: -+ if (!*idx) -+ *idx = __bch2_journal_key_search(keys, btree_id, level, pos); -+ -+ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { -+ if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) -+ return NULL; -+ -+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && -+ !k->overwritten) -+ return k->k; -+ -+ (*idx)++; -+ iters++; -+ if (iters == 10) { -+ *idx = 0; -+ goto search; -+ } -+ } -+ -+ return NULL; -+} -+ -+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, -+ unsigned level, struct bpos pos) -+{ -+ size_t idx = 0; -+ -+ return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); -+} -+ -+static void journal_iters_fix(struct bch_fs *c) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ /* The key we just inserted is immediately before the gap: */ -+ size_t gap_end = keys->gap + (keys->size - keys->nr); -+ struct btree_and_journal_iter *iter; -+ -+ /* -+ * If an iterator points one after the key we just inserted, decrement -+ * the iterator so it points at the key we just inserted - if the -+ * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will -+ * handle that: -+ */ -+ list_for_each_entry(iter, &c->journal_iters, journal.list) -+ if (iter->journal.idx == gap_end) -+ iter->journal.idx = keys->gap - 1; -+} -+ -+static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ struct journal_iter *iter; -+ size_t gap_size = keys->size - keys->nr; -+ -+ list_for_each_entry(iter, &c->journal_iters, list) { -+ if (iter->idx > old_gap) -+ iter->idx -= gap_size; -+ if (iter->idx >= new_gap) -+ iter->idx += gap_size; -+ } -+} -+ -+int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_i *k) -+{ -+ struct journal_key n = { -+ .btree_id = id, -+ .level = level, -+ .k = k, -+ .allocated = true, -+ /* -+ * Ensure these keys are done last by journal replay, to unblock -+ * journal reclaim: -+ */ -+ .journal_seq = U32_MAX, -+ }; -+ struct journal_keys *keys = &c->journal_keys; -+ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); -+ -+ BUG_ON(test_bit(BCH_FS_RW, &c->flags)); -+ -+ if (idx < keys->size && -+ journal_key_cmp(&n, &keys->d[idx]) == 0) { -+ if (keys->d[idx].allocated) -+ kfree(keys->d[idx].k); -+ keys->d[idx] = n; -+ return 0; -+ } -+ -+ if (idx > keys->gap) -+ idx -= keys->size - keys->nr; -+ -+ if (keys->nr == keys->size) { -+ struct journal_keys new_keys = { -+ .nr = keys->nr, -+ .size = max_t(size_t, keys->size, 8) * 2, -+ }; -+ -+ new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); -+ if (!new_keys.d) { -+ bch_err(c, "%s: error allocating new key array (size %zu)", -+ __func__, new_keys.size); -+ return -BCH_ERR_ENOMEM_journal_key_insert; -+ } -+ -+ /* Since @keys was full, there was no gap: */ -+ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); -+ kvfree(keys->d); -+ *keys = new_keys; -+ -+ /* And now the gap is at the end: */ -+ keys->gap = keys->nr; 
-+ } -+ -+ journal_iters_move_gap(c, keys->gap, idx); -+ -+ move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); -+ keys->gap = idx; -+ -+ keys->nr++; -+ keys->d[keys->gap++] = n; -+ -+ journal_iters_fix(c); -+ -+ return 0; -+} -+ -+/* -+ * Can only be used from the recovery thread while we're still RO - can't be -+ * used once we've got RW, as journal_keys is at that point used by multiple -+ * threads: -+ */ -+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_i *k) -+{ -+ struct bkey_i *n; -+ int ret; -+ -+ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); -+ if (!n) -+ return -BCH_ERR_ENOMEM_journal_key_insert; -+ -+ bkey_copy(n, k); -+ ret = bch2_journal_key_insert_take(c, id, level, n); -+ if (ret) -+ kfree(n); -+ return ret; -+} -+ -+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bpos pos) -+{ -+ struct bkey_i whiteout; -+ -+ bkey_init(&whiteout.k); -+ whiteout.k.p = pos; -+ -+ return bch2_journal_key_insert(c, id, level, &whiteout); -+} -+ -+void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, -+ unsigned level, struct bpos pos) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ size_t idx = bch2_journal_key_search(keys, btree, level, pos); -+ -+ if (idx < keys->size && -+ keys->d[idx].btree_id == btree && -+ keys->d[idx].level == level && -+ bpos_eq(keys->d[idx].k->k.p, pos)) -+ keys->d[idx].overwritten = true; -+} -+ -+static void bch2_journal_iter_advance(struct journal_iter *iter) -+{ -+ if (iter->idx < iter->keys->size) { -+ iter->idx++; -+ if (iter->idx == iter->keys->gap) -+ iter->idx += iter->keys->size - iter->keys->nr; -+ } -+} -+ -+static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) -+{ -+ struct journal_key *k = iter->keys->d + iter->idx; -+ -+ while (k < iter->keys->d + iter->keys->size && -+ k->btree_id == iter->btree_id && -+ k->level == iter->level) { -+ if (!k->overwritten) -+ return bkey_i_to_s_c(k->k); -+ -+ bch2_journal_iter_advance(iter); -+ k = iter->keys->d + iter->idx; -+ } -+ -+ return bkey_s_c_null; -+} -+ -+static void bch2_journal_iter_exit(struct journal_iter *iter) -+{ -+ list_del(&iter->list); -+} -+ -+static void bch2_journal_iter_init(struct bch_fs *c, -+ struct journal_iter *iter, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ iter->btree_id = id; -+ iter->level = level; -+ iter->keys = &c->journal_keys; -+ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); -+} -+ -+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) -+{ -+ return bch2_btree_node_iter_peek_unpack(&iter->node_iter, -+ iter->b, &iter->unpacked); -+} -+ -+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) -+{ -+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); -+} -+ -+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) -+{ -+ if (bpos_eq(iter->pos, SPOS_MAX)) -+ iter->at_end = true; -+ else -+ iter->pos = bpos_successor(iter->pos); -+} -+ -+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) -+{ -+ struct bkey_s_c btree_k, journal_k, ret; -+again: -+ if (iter->at_end) -+ return bkey_s_c_null; -+ -+ while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && -+ bpos_lt(btree_k.k->p, iter->pos)) -+ bch2_journal_iter_advance_btree(iter); -+ -+ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && -+ bpos_lt(journal_k.k->p, iter->pos)) -+ bch2_journal_iter_advance(&iter->journal); 
-+ -+ ret = journal_k.k && -+ (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) -+ ? journal_k -+ : btree_k; -+ -+ if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) -+ ret = bkey_s_c_null; -+ -+ if (ret.k) { -+ iter->pos = ret.k->p; -+ if (bkey_deleted(ret.k)) { -+ bch2_btree_and_journal_iter_advance(iter); -+ goto again; -+ } -+ } else { -+ iter->pos = SPOS_MAX; -+ iter->at_end = true; -+ } -+ -+ return ret; -+} -+ -+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) -+{ -+ bch2_journal_iter_exit(&iter->journal); -+} -+ -+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, -+ struct bch_fs *c, -+ struct btree *b, -+ struct btree_node_iter node_iter, -+ struct bpos pos) -+{ -+ memset(iter, 0, sizeof(*iter)); -+ -+ iter->b = b; -+ iter->node_iter = node_iter; -+ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); -+ INIT_LIST_HEAD(&iter->journal.list); -+ iter->pos = b->data->min_key; -+ iter->at_end = false; -+} -+ -+/* -+ * this version is used by btree_gc before filesystem has gone RW and -+ * multithreaded, so uses the journal_iters list: -+ */ -+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, -+ struct bch_fs *c, -+ struct btree *b) -+{ -+ struct btree_node_iter node_iter; -+ -+ bch2_btree_node_iter_init_from_start(&node_iter, b); -+ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); -+ list_add(&iter->journal.list, &c->journal_iters); -+} -+ -+/* sort and dedup all keys in the journal: */ -+ -+void bch2_journal_entries_free(struct bch_fs *c) -+{ -+ struct journal_replay **i; -+ struct genradix_iter iter; -+ -+ genradix_for_each(&c->journal_entries, iter, i) -+ if (*i) -+ kvpfree(*i, offsetof(struct journal_replay, j) + -+ vstruct_bytes(&(*i)->j)); -+ genradix_free(&c->journal_entries); -+} -+ -+/* -+ * When keys compare equal, oldest compares first: -+ */ -+static int journal_sort_key_cmp(const void *_l, const void *_r) -+{ -+ const struct journal_key *l = _l; -+ const struct journal_key *r = _r; -+ -+ return journal_key_cmp(l, r) ?: -+ cmp_int(l->journal_seq, r->journal_seq) ?: -+ cmp_int(l->journal_offset, r->journal_offset); -+} -+ -+void bch2_journal_keys_free(struct journal_keys *keys) -+{ -+ struct journal_key *i; -+ -+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); -+ keys->gap = keys->nr; -+ -+ for (i = keys->d; i < keys->d + keys->nr; i++) -+ if (i->allocated) -+ kfree(i->k); -+ -+ kvfree(keys->d); -+ keys->d = NULL; -+ keys->nr = keys->gap = keys->size = 0; -+} -+ -+static void __journal_keys_sort(struct journal_keys *keys) -+{ -+ struct journal_key *src, *dst; -+ -+ sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); -+ -+ src = dst = keys->d; -+ while (src < keys->d + keys->nr) { -+ while (src + 1 < keys->d + keys->nr && -+ src[0].btree_id == src[1].btree_id && -+ src[0].level == src[1].level && -+ bpos_eq(src[0].k->k.p, src[1].k->k.p)) -+ src++; -+ -+ *dst++ = *src++; -+ } -+ -+ keys->nr = dst - keys->d; -+} -+ -+static int journal_keys_sort(struct bch_fs *c) -+{ -+ struct genradix_iter iter; -+ struct journal_replay *i, **_i; -+ struct jset_entry *entry; -+ struct bkey_i *k; -+ struct journal_keys *keys = &c->journal_keys; -+ size_t nr_keys = 0, nr_read = 0; -+ -+ genradix_for_each(&c->journal_entries, iter, _i) { -+ i = *_i; -+ -+ if (!i || i->ignore) -+ continue; -+ -+ for_each_jset_key(k, entry, &i->j) -+ nr_keys++; -+ } -+ -+ if (!nr_keys) -+ return 0; -+ -+ 
keys->size = roundup_pow_of_two(nr_keys); -+ -+ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); -+ if (!keys->d) { -+ bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", -+ nr_keys); -+ -+ do { -+ keys->size >>= 1; -+ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); -+ } while (!keys->d && keys->size > nr_keys / 8); -+ -+ if (!keys->d) { -+ bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", -+ keys->size); -+ return -BCH_ERR_ENOMEM_journal_keys_sort; -+ } -+ } -+ -+ genradix_for_each(&c->journal_entries, iter, _i) { -+ i = *_i; -+ -+ if (!i || i->ignore) -+ continue; -+ -+ cond_resched(); -+ -+ for_each_jset_key(k, entry, &i->j) { -+ if (keys->nr == keys->size) { -+ __journal_keys_sort(keys); -+ -+ if (keys->nr > keys->size * 7 / 8) { -+ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", -+ keys->nr, keys->size, nr_read, nr_keys); -+ return -BCH_ERR_ENOMEM_journal_keys_sort; -+ } -+ } -+ -+ keys->d[keys->nr++] = (struct journal_key) { -+ .btree_id = entry->btree_id, -+ .level = entry->level, -+ .k = k, -+ .journal_seq = le64_to_cpu(i->j.seq), -+ .journal_offset = k->_data - i->j._data, -+ }; -+ -+ nr_read++; -+ } -+ } -+ -+ __journal_keys_sort(keys); -+ keys->gap = keys->nr; -+ -+ bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); -+ return 0; -+} -+ +/* journal replay: */ + +static void replay_now_at(struct journal *j, u64 seq) @@ -77335,7 +77741,7 @@ index 000000000..dcd4f9f41 + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + -+ keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); ++ keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL); + if (!keys_sorted) + return -BCH_ERR_ENOMEM_journal_replay; + @@ -77533,134 +77939,6 @@ index 000000000..dcd4f9f41 + +/* sb clean section: */ + -+static struct bkey_i *btree_root_find(struct bch_fs *c, -+ struct bch_sb_field_clean *clean, -+ struct jset *j, -+ enum btree_id id, unsigned *level) -+{ -+ struct bkey_i *k; -+ struct jset_entry *entry, *start, *end; -+ -+ if (clean) { -+ start = clean->start; -+ end = vstruct_end(&clean->field); -+ } else { -+ start = j->start; -+ end = vstruct_last(j); -+ } -+ -+ for (entry = start; entry < end; entry = vstruct_next(entry)) -+ if (entry->type == BCH_JSET_ENTRY_btree_root && -+ entry->btree_id == id) -+ goto found; -+ -+ return NULL; -+found: -+ if (!entry->u64s) -+ return ERR_PTR(-EINVAL); -+ -+ k = entry->start; -+ *level = entry->level; -+ return k; -+} -+ -+static int verify_superblock_clean(struct bch_fs *c, -+ struct bch_sb_field_clean **cleanp, -+ struct jset *j) -+{ -+ unsigned i; -+ struct bch_sb_field_clean *clean = *cleanp; -+ struct printbuf buf1 = PRINTBUF; -+ struct printbuf buf2 = PRINTBUF; -+ int ret = 0; -+ -+ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, -+ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", -+ le64_to_cpu(clean->journal_seq), -+ le64_to_cpu(j->seq))) { -+ kfree(clean); -+ *cleanp = NULL; -+ return 0; -+ } -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ struct bkey_i *k1, *k2; -+ unsigned l1 = 0, l2 = 0; -+ -+ k1 = btree_root_find(c, clean, NULL, i, &l1); -+ k2 = btree_root_find(c, NULL, j, i, &l2); -+ -+ if (!k1 && !k2) -+ continue; -+ -+ printbuf_reset(&buf1); -+ printbuf_reset(&buf2); -+ -+ if (k1) -+ bch2_bkey_val_to_text(&buf1, c, 
bkey_i_to_s_c(k1)); -+ else -+ prt_printf(&buf1, "(none)"); -+ -+ if (k2) -+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); -+ else -+ prt_printf(&buf2, "(none)"); -+ -+ mustfix_fsck_err_on(!k1 || !k2 || -+ IS_ERR(k1) || -+ IS_ERR(k2) || -+ k1->k.u64s != k2->k.u64s || -+ memcmp(k1, k2, bkey_bytes(&k1->k)) || -+ l1 != l2, c, -+ "superblock btree root %u doesn't match journal after clean shutdown\n" -+ "sb: l=%u %s\n" -+ "journal: l=%u %s\n", i, -+ l1, buf1.buf, -+ l2, buf2.buf); -+ } -+fsck_err: -+ printbuf_exit(&buf2); -+ printbuf_exit(&buf1); -+ return ret; -+} -+ -+static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) -+{ -+ struct bch_sb_field_clean *clean, *sb_clean; -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); -+ -+ if (fsck_err_on(!sb_clean, c, -+ "superblock marked clean but clean section not present")) { -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ c->sb.clean = false; -+ mutex_unlock(&c->sb_lock); -+ return NULL; -+ } -+ -+ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), -+ GFP_KERNEL); -+ if (!clean) { -+ mutex_unlock(&c->sb_lock); -+ return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); -+ } -+ -+ ret = bch2_sb_clean_validate_late(c, clean, READ); -+ if (ret) { -+ mutex_unlock(&c->sb_lock); -+ return ERR_PTR(ret); -+ } -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return clean; -+fsck_err: -+ mutex_unlock(&c->sb_lock); -+ return ERR_PTR(ret); -+} -+ +static bool btree_id_is_alloc(enum btree_id id) +{ + switch (id) { @@ -77807,9 +78085,38 @@ index 000000000..dcd4f9f41 + return ret; +} + ++const char * const bch2_recovery_passes[] = { ++#define x(_fn, _when) #_fn, ++ BCH_RECOVERY_PASSES() ++#undef x ++ NULL ++}; ++ ++static int bch2_check_allocations(struct bch_fs *c) ++{ ++ return bch2_gc(c, true, c->opts.norecovery); ++} ++ ++static int bch2_set_may_go_rw(struct bch_fs *c) ++{ ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); ++ return 0; ++} ++ ++struct recovery_pass_fn { ++ int (*fn)(struct bch_fs *); ++ unsigned when; ++}; ++ ++static struct recovery_pass_fn recovery_pass_fns[] = { ++#define x(_fn, _when) { .fn = bch2_##_fn, .when = _when }, ++ BCH_RECOVERY_PASSES() ++#undef x ++}; ++ +static void check_version_upgrade(struct bch_fs *c) +{ -+ unsigned latest_compatible = bch2_version_compatible(c->sb.version); ++ unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version); + unsigned latest_version = bcachefs_metadata_version_current; + unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; + unsigned new_version = 0; @@ -77859,7 +78166,12 @@ index 000000000..dcd4f9f41 + + recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); + if (recovery_passes) { -+ prt_str(&buf, "fsck required"); ++ if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK) ++ prt_str(&buf, "fsck required"); ++ else { ++ prt_str(&buf, "running recovery passses: "); ++ prt_bitflags(&buf, bch2_recovery_passes, recovery_passes); ++ } + + c->recovery_passes_explicit |= recovery_passes; + c->opts.fix_errors = FSCK_FIX_yes; @@ -77875,42 +78187,19 @@ index 000000000..dcd4f9f41 + } +} + -+static int bch2_check_allocations(struct bch_fs *c) -+{ -+ return bch2_gc(c, true, c->opts.norecovery); -+} -+ -+static int bch2_set_may_go_rw(struct bch_fs *c) -+{ -+ set_bit(BCH_FS_MAY_GO_RW, &c->flags); -+ return 0; -+} -+ -+struct recovery_pass_fn { -+ int (*fn)(struct bch_fs *); -+ const char *name; -+ unsigned when; -+}; -+ -+static struct recovery_pass_fn recovery_passes[] = { 
-+#define x(_fn, _when) { .fn = bch2_##_fn, .name = #_fn, .when = _when }, -+ BCH_RECOVERY_PASSES() -+#undef x -+}; -+ +u64 bch2_fsck_recovery_passes(void) +{ + u64 ret = 0; + -+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++) -+ if (recovery_passes[i].when & PASS_FSCK) ++ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) ++ if (recovery_pass_fns[i].when & PASS_FSCK) + ret |= BIT_ULL(i); + return ret; +} + +static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ -+ struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass; ++ struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass; + + if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) + return false; @@ -77932,15 +78221,18 @@ index 000000000..dcd4f9f41 + c->curr_recovery_pass = pass; + + if (should_run_recovery_pass(c, pass)) { -+ struct recovery_pass_fn *p = recovery_passes + pass; ++ struct recovery_pass_fn *p = recovery_pass_fns + pass; + + if (!(p->when & PASS_SILENT)) -+ printk(KERN_INFO bch2_log_msg(c, "%s..."), p->name); ++ printk(KERN_INFO bch2_log_msg(c, "%s..."), ++ bch2_recovery_passes[pass]); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + printk(KERN_CONT " done\n"); ++ ++ c->recovery_passes_complete |= BIT_ULL(pass); + } + + return 0; @@ -77950,7 +78242,7 @@ index 000000000..dcd4f9f41 +{ + int ret = 0; + -+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) { ++ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) + continue; @@ -77970,17 +78262,17 @@ index 000000000..dcd4f9f41 + bool write_sb = false; + int ret = 0; + -+ if (c->sb.clean) -+ clean = read_superblock_clean(c); -+ ret = PTR_ERR_OR_ZERO(clean); -+ if (ret) -+ goto err; ++ if (c->sb.clean) { ++ clean = bch2_read_superblock_clean(c); ++ ret = PTR_ERR_OR_ZERO(clean); ++ if (ret) ++ goto err; + -+ if (c->sb.clean) + bch_info(c, "recovering from clean shutdown, journal seq %llu", + le64_to_cpu(clean->journal_seq)); -+ else ++ } else { + bch_info(c, "recovering from unclean shutdown"); ++ } + + if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { + bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); @@ -77995,12 +78287,6 @@ index 000000000..dcd4f9f41 + goto err; + } + -+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { -+ bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); -+ ret = -EINVAL; -+ goto err; -+ } -+ + if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery)) + check_version_upgrade(c); + @@ -78060,12 +78346,12 @@ index 000000000..dcd4f9f41 + } + } + -+ ret = journal_keys_sort(c); ++ ret = bch2_journal_keys_sort(c); + if (ret) + goto err; + + if (c->sb.clean && last_journal_entry) { -+ ret = verify_superblock_clean(c, &clean, ++ ret = bch2_verify_superblock_clean(c, &clean, + last_journal_entry); + if (ret) + goto err; @@ -78082,7 +78368,7 @@ index 000000000..dcd4f9f41 + } + + c->journal_replay_seq_start = last_seq; -+ c->journal_replay_seq_end = blacklist_seq - 1;; ++ c->journal_replay_seq_end = blacklist_seq - 1; + + if (c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); @@ -78145,6 +78431,29 @@ index 000000000..dcd4f9f41 + if (ret) + goto err; + ++ /* If we fixed errors, verify that fs is actually clean now: */ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) 
&& ++ test_bit(BCH_FS_ERRORS_FIXED, &c->flags) && ++ !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) && ++ !test_bit(BCH_FS_ERROR, &c->flags)) { ++ bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); ++ clear_bit(BCH_FS_ERRORS_FIXED, &c->flags); ++ ++ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; ++ ++ ret = bch2_run_recovery_passes(c); ++ if (ret) ++ goto err; ++ ++ if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) || ++ test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { ++ bch_err(c, "Second fsck run was not clean"); ++ set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); ++ } ++ ++ set_bit(BCH_FS_ERRORS_FIXED, &c->flags); ++ } ++ + if (enabled_qtypes(c)) { + bch_verbose(c, "reading quotas"); + ret = bch2_fs_quota_read(c); @@ -78177,7 +78486,6 @@ index 000000000..dcd4f9f41 + mutex_unlock(&c->sb_lock); + + if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || -+ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || + c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { + struct bch_move_stats stats; + @@ -78245,7 +78553,7 @@ index 000000000..dcd4f9f41 + } + mutex_unlock(&c->sb_lock); + -+ c->curr_recovery_pass = ARRAY_SIZE(recovery_passes); ++ c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); + @@ -78357,63 +78665,36 @@ index 000000000..dcd4f9f41 +} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h new file mode 100644 -index 000000000..f8e796c0f +index 000000000..852d30567 --- /dev/null +++ b/fs/bcachefs/recovery.h -@@ -0,0 +1,60 @@ +@@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_H +#define _BCACHEFS_RECOVERY_H + -+struct journal_iter { -+ struct list_head list; -+ enum btree_id btree_id; -+ unsigned level; -+ size_t idx; -+ struct journal_keys *keys; -+}; ++extern const char * const bch2_recovery_passes[]; + +/* -+ * Iterate over keys in the btree, with keys from the journal overlaid on top: ++ * For when we need to rewind recovery passes and run a pass we skipped: + */ ++static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, ++ enum bch_recovery_pass pass) ++{ ++ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", ++ bch2_recovery_passes[pass], pass, ++ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + -+struct btree_and_journal_iter { -+ struct btree *b; -+ struct btree_node_iter node_iter; -+ struct bkey unpacked; ++ c->recovery_passes_explicit |= BIT_ULL(pass); + -+ struct journal_iter journal; -+ struct bpos pos; -+ bool at_end; -+}; -+ -+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, -+ unsigned, struct bpos, struct bpos, size_t *); -+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, -+ unsigned, struct bpos); -+ -+int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, -+ unsigned, struct bkey_i *); -+int bch2_journal_key_insert(struct bch_fs *, enum btree_id, -+ unsigned, struct bkey_i *); -+int bch2_journal_key_delete(struct bch_fs *, enum btree_id, -+ unsigned, struct bpos); -+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, -+ unsigned, struct bpos); -+ -+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); -+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); -+ -+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -+void 
__bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, -+ struct bch_fs *, struct btree *, -+ struct btree_node_iter, struct bpos); -+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, -+ struct bch_fs *, -+ struct btree *); -+ -+void bch2_journal_keys_free(struct journal_keys *); -+void bch2_journal_entries_free(struct bch_fs *); ++ if (c->curr_recovery_pass >= pass) { ++ c->curr_recovery_pass = pass; ++ c->recovery_passes_complete &= (1ULL << pass) >> 1; ++ return -BCH_ERR_restart_recovery; ++ } else { ++ return 0; ++ } ++} + +u64 bch2_fsck_recovery_passes(void); + @@ -78421,6 +78702,60 @@ index 000000000..f8e796c0f +int bch2_fs_initialize(struct bch_fs *); + +#endif /* _BCACHEFS_RECOVERY_H */ +diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h +new file mode 100644 +index 000000000..abf1f834e +--- /dev/null ++++ b/fs/bcachefs/recovery_types.h +@@ -0,0 +1,48 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_RECOVERY_TYPES_H ++#define _BCACHEFS_RECOVERY_TYPES_H ++ ++#define PASS_SILENT BIT(0) ++#define PASS_FSCK BIT(1) ++#define PASS_UNCLEAN BIT(2) ++#define PASS_ALWAYS BIT(3) ++ ++#define BCH_RECOVERY_PASSES() \ ++ x(alloc_read, PASS_ALWAYS) \ ++ x(stripes_read, PASS_ALWAYS) \ ++ x(initialize_subvolumes, 0) \ ++ x(snapshots_read, PASS_ALWAYS) \ ++ x(check_topology, 0) \ ++ x(check_allocations, PASS_FSCK) \ ++ x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ ++ x(journal_replay, PASS_ALWAYS) \ ++ x(check_alloc_info, PASS_FSCK) \ ++ x(check_lrus, PASS_FSCK) \ ++ x(check_btree_backpointers, PASS_FSCK) \ ++ x(check_backpointers_to_extents,PASS_FSCK) \ ++ x(check_extents_to_backpointers,PASS_FSCK) \ ++ x(check_alloc_to_lru_refs, PASS_FSCK) \ ++ x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ ++ x(bucket_gens_init, 0) \ ++ x(check_snapshot_trees, PASS_FSCK) \ ++ x(check_snapshots, PASS_FSCK) \ ++ x(check_subvols, PASS_FSCK) \ ++ x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ ++ x(fs_upgrade_for_subvolumes, 0) \ ++ x(check_inodes, PASS_FSCK) \ ++ x(check_extents, PASS_FSCK) \ ++ x(check_dirents, PASS_FSCK) \ ++ x(check_xattrs, PASS_FSCK) \ ++ x(check_root, PASS_FSCK) \ ++ x(check_directory_structure, PASS_FSCK) \ ++ x(check_nlinks, PASS_FSCK) \ ++ x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \ ++ x(fix_reflink_p, 0) \ ++ ++enum bch_recovery_pass { ++#define x(n, when) BCH_RECOVERY_PASS_##n, ++ BCH_RECOVERY_PASSES() ++#undef x ++}; ++ ++#endif /* _BCACHEFS_RECOVERY_TYPES_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c new file mode 100644 index 000000000..39f711d50 @@ -80108,6 +80443,790 @@ index 000000000..5cfff489b +}; + +#endif /* _BCACHEFS_REPLICAS_TYPES_H */ +diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c +new file mode 100644 +index 000000000..a3695e56a +--- /dev/null ++++ b/fs/bcachefs/sb-clean.c +@@ -0,0 +1,395 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "error.h" ++#include "journal_io.h" ++#include "replicas.h" ++#include "sb-clean.h" ++#include "super-io.h" ++ ++/* ++ * BCH_SB_FIELD_clean: ++ * ++ * Btree roots, and a few other things, are recovered from the journal after an ++ * unclean shutdown - but after a clean shutdown, to avoid having to read the ++ * journal, we can store them in the superblock. 
++ * ++ * bch_sb_field_clean simply contains a list of journal entries, stored exactly ++ * as they would be in the journal: ++ */ ++ ++int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, ++ int write) ++{ ++ struct jset_entry *entry; ++ int ret; ++ ++ for (entry = clean->start; ++ entry < (struct jset_entry *) vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ ret = bch2_journal_entry_validate(c, NULL, entry, ++ le16_to_cpu(c->disk_sb.sb->version), ++ BCH_SB_BIG_ENDIAN(c->disk_sb.sb), ++ write); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static struct bkey_i *btree_root_find(struct bch_fs *c, ++ struct bch_sb_field_clean *clean, ++ struct jset *j, ++ enum btree_id id, unsigned *level) ++{ ++ struct bkey_i *k; ++ struct jset_entry *entry, *start, *end; ++ ++ if (clean) { ++ start = clean->start; ++ end = vstruct_end(&clean->field); ++ } else { ++ start = j->start; ++ end = vstruct_last(j); ++ } ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root && ++ entry->btree_id == id) ++ goto found; ++ ++ return NULL; ++found: ++ if (!entry->u64s) ++ return ERR_PTR(-EINVAL); ++ ++ k = entry->start; ++ *level = entry->level; ++ return k; ++} ++ ++int bch2_verify_superblock_clean(struct bch_fs *c, ++ struct bch_sb_field_clean **cleanp, ++ struct jset *j) ++{ ++ unsigned i; ++ struct bch_sb_field_clean *clean = *cleanp; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ int ret = 0; ++ ++ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, ++ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", ++ le64_to_cpu(clean->journal_seq), ++ le64_to_cpu(j->seq))) { ++ kfree(clean); ++ *cleanp = NULL; ++ return 0; ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct bkey_i *k1, *k2; ++ unsigned l1 = 0, l2 = 0; ++ ++ k1 = btree_root_find(c, clean, NULL, i, &l1); ++ k2 = btree_root_find(c, NULL, j, i, &l2); ++ ++ if (!k1 && !k2) ++ continue; ++ ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ ++ if (k1) ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); ++ else ++ prt_printf(&buf1, "(none)"); ++ ++ if (k2) ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); ++ else ++ prt_printf(&buf2, "(none)"); ++ ++ mustfix_fsck_err_on(!k1 || !k2 || ++ IS_ERR(k1) || ++ IS_ERR(k2) || ++ k1->k.u64s != k2->k.u64s || ++ memcmp(k1, k2, bkey_bytes(&k1->k)) || ++ l1 != l2, c, ++ "superblock btree root %u doesn't match journal after clean shutdown\n" ++ "sb: l=%u %s\n" ++ "journal: l=%u %s\n", i, ++ l1, buf1.buf, ++ l2, buf2.buf); ++ } ++fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *clean, *sb_clean; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); ++ ++ if (fsck_err_on(!sb_clean, c, ++ "superblock marked clean but clean section not present")) { ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ mutex_unlock(&c->sb_lock); ++ return NULL; ++ } ++ ++ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), ++ GFP_KERNEL); ++ if (!clean) { ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); ++ } ++ ++ ret = bch2_sb_clean_validate_late(c, clean, READ); ++ if (ret) { ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(ret); ++ } ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return clean; ++fsck_err: ++ 
mutex_unlock(&c->sb_lock); ++ return ERR_PTR(ret); ++} ++ ++static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) ++{ ++ struct jset_entry *entry = *end; ++ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); ++ ++ memset(entry, 0, u64s * sizeof(u64)); ++ /* ++ * The u64s field counts from the start of data, ignoring the shared ++ * fields. ++ */ ++ entry->u64s = cpu_to_le16(u64s - 1); ++ ++ *end = vstruct_next(*end); ++ return entry; ++} ++ ++void bch2_journal_super_entries_add_common(struct bch_fs *c, ++ struct jset_entry **end, ++ u64 journal_seq) ++{ ++ struct bch_dev *ca; ++ unsigned i, dev; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ if (!journal_seq) { ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ bch2_fs_usage_acc_to_base(c, i); ++ } else { ++ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = BCH_FS_USAGE_inodes; ++ u->v = cpu_to_le64(c->usage_base->nr_inodes); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = BCH_FS_USAGE_key_version; ++ u->v = cpu_to_le64(atomic64_read(&c->key_version)); ++ } ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ struct jset_entry_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = BCH_FS_USAGE_reserved; ++ u->entry.level = i; ++ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct jset_entry_data_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), ++ struct jset_entry_data_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_data_usage; ++ u->v = cpu_to_le64(c->usage_base->replicas[i]); ++ unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), ++ "embedded variable length struct"); ++ } ++ ++ for_each_member_device(ca, c, dev) { ++ unsigned b = sizeof(struct jset_entry_dev_usage) + ++ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; ++ struct jset_entry_dev_usage *u = ++ container_of(jset_entry_init(end, b), ++ struct jset_entry_dev_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_dev_usage; ++ u->dev = cpu_to_le32(dev); ++ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); ++ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); ++ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); ++ } ++ } ++ ++ percpu_up_read(&c->mark_lock); ++ ++ for (i = 0; i < 2; i++) { ++ struct jset_entry_clock *clock = ++ container_of(jset_entry_init(end, sizeof(*clock)), ++ struct jset_entry_clock, entry); ++ ++ clock->entry.type = BCH_JSET_ENTRY_clock; ++ clock->rw = i; ++ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); ++ } ++} ++ ++static int bch2_sb_clean_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ ++ if (vstruct_bytes(&clean->field) < sizeof(*clean)) { ++ prt_printf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&clean->field), sizeof(*clean)); ++ return 
-BCH_ERR_invalid_sb_clean; ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ struct jset_entry *entry; ++ ++ prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); ++ prt_newline(out); ++ prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); ++ prt_newline(out); ++ ++ for (entry = clean->start; ++ entry != vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ if (entry->type == BCH_JSET_ENTRY_btree_keys && ++ !entry->u64s) ++ continue; ++ ++ bch2_journal_entry_to_text(out, NULL, entry); ++ prt_newline(out); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_clean = { ++ .validate = bch2_sb_clean_validate, ++ .to_text = bch2_sb_clean_to_text, ++}; ++ ++int bch2_fs_mark_dirty(struct bch_fs *c) ++{ ++ int ret; ++ ++ /* ++ * Unconditionally write superblock, to verify it hasn't changed before ++ * we go rw: ++ */ ++ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ ++ bch2_sb_maybe_downgrade(c); ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); ++ ++ ret = bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++void bch2_fs_mark_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *sb_clean; ++ struct jset_entry *entry; ++ unsigned u64s; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ if (BCH_SB_CLEAN(c->disk_sb.sb)) ++ goto out; ++ ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); ++ ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); ++ ++ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; ++ ++ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); ++ if (!sb_clean) { ++ bch_err(c, "error resizing superblock while setting filesystem clean"); ++ goto out; ++ } ++ ++ sb_clean->flags = 0; ++ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); ++ ++ /* Trying to catch outstanding bug: */ ++ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); ++ ++ entry = sb_clean->start; ++ bch2_journal_super_entries_add_common(c, &entry, 0); ++ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); ++ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); ++ ++ memset(entry, 0, ++ vstruct_end(&sb_clean->field) - (void *) entry); ++ ++ /* ++ * this should be in the write path, and we should be validating every ++ * superblock section: ++ */ ++ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); ++ if (ret) { ++ bch_err(c, "error writing marking filesystem clean: validate error"); ++ goto out; ++ } ++ ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++} +diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h +new file mode 100644 +index 000000000..71caef281 +--- /dev/null ++++ b/fs/bcachefs/sb-clean.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SB_CLEAN_H ++#define _BCACHEFS_SB_CLEAN_H ++ ++int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); ++int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **, ++ struct jset *); ++struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *); ++void 
bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_clean; ++ ++int bch2_fs_mark_dirty(struct bch_fs *); ++void bch2_fs_mark_clean(struct bch_fs *); ++ ++#endif /* _BCACHEFS_SB_CLEAN_H */ +diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c +new file mode 100644 +index 000000000..16a2b3389 +--- /dev/null ++++ b/fs/bcachefs/sb-members.c +@@ -0,0 +1,173 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "disk_groups.h" ++#include "replicas.h" ++#include "sb-members.h" ++#include "super-io.h" ++ ++/* Code for bch_sb_field_members: */ ++ ++static int bch2_sb_members_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ unsigned i; ++ ++ if ((void *) (mi->members + sb->nr_devices) > ++ vstruct_end(&mi->field)) { ++ prt_printf(err, "too many devices for section size"); ++ return -BCH_ERR_invalid_sb_members; ++ } ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ if (le64_to_cpu(m->nbuckets) > LONG_MAX) { ++ prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", ++ i, le64_to_cpu(m->nbuckets), LONG_MAX); ++ return -BCH_ERR_invalid_sb_members; ++ } ++ ++ if (le64_to_cpu(m->nbuckets) - ++ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { ++ prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", ++ i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); ++ return -BCH_ERR_invalid_sb_members; ++ } ++ ++ if (le16_to_cpu(m->bucket_size) < ++ le16_to_cpu(sb->block_size)) { ++ prt_printf(err, "device %u: bucket size %u smaller than block size %u", ++ i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); ++ return -BCH_ERR_invalid_sb_members; ++ } ++ ++ if (le16_to_cpu(m->bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(sb)) { ++ prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", ++ i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); ++ return -BCH_ERR_invalid_sb_members; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); ++ unsigned i; ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ unsigned data_have = bch2_sb_dev_has_data(sb, i); ++ u64 bucket_size = le16_to_cpu(m->bucket_size); ++ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ prt_printf(out, "Device:"); ++ prt_tab(out); ++ prt_printf(out, "%u", i); ++ prt_newline(out); ++ ++ printbuf_indent_add(out, 2); ++ ++ prt_printf(out, "UUID:"); ++ prt_tab(out); ++ pr_uuid(out, m->uuid.b); ++ prt_newline(out); ++ ++ prt_printf(out, "Size:"); ++ prt_tab(out); ++ prt_units_u64(out, device_size << 9); ++ prt_newline(out); ++ ++ prt_printf(out, "Bucket size:"); ++ prt_tab(out); ++ prt_units_u64(out, bucket_size << 9); ++ prt_newline(out); ++ ++ prt_printf(out, "First bucket:"); ++ prt_tab(out); ++ prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); ++ prt_newline(out); ++ ++ prt_printf(out, "Buckets:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); ++ prt_newline(out); ++ ++ prt_printf(out, "Last mount:"); ++ prt_tab(out); ++ if 
(m->last_mount) ++ pr_time(out, le64_to_cpu(m->last_mount)); ++ else ++ prt_printf(out, "(never)"); ++ prt_newline(out); ++ ++ prt_printf(out, "State:"); ++ prt_tab(out); ++ prt_printf(out, "%s", ++ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR ++ ? bch2_member_states[BCH_MEMBER_STATE(m)] ++ : "unknown"); ++ prt_newline(out); ++ ++ prt_printf(out, "Label:"); ++ prt_tab(out); ++ if (BCH_MEMBER_GROUP(m)) { ++ unsigned idx = BCH_MEMBER_GROUP(m) - 1; ++ ++ if (idx < disk_groups_nr(gi)) ++ prt_printf(out, "%s (%u)", ++ gi->entries[idx].label, idx); ++ else ++ prt_printf(out, "(bad disk labels section)"); ++ } else { ++ prt_printf(out, "(none)"); ++ } ++ prt_newline(out); ++ ++ prt_printf(out, "Data allowed:"); ++ prt_tab(out); ++ if (BCH_MEMBER_DATA_ALLOWED(m)) ++ prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); ++ else ++ prt_printf(out, "(none)"); ++ prt_newline(out); ++ ++ prt_printf(out, "Has data:"); ++ prt_tab(out); ++ if (data_have) ++ prt_bitflags(out, bch2_data_types, data_have); ++ else ++ prt_printf(out, "(none)"); ++ prt_newline(out); ++ ++ prt_printf(out, "Discard:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); ++ prt_newline(out); ++ ++ prt_printf(out, "Freespace initialized:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); ++ prt_newline(out); ++ ++ printbuf_indent_sub(out, 2); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_members = { ++ .validate = bch2_sb_members_validate, ++ .to_text = bch2_sb_members_to_text, ++}; +diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h +new file mode 100644 +index 000000000..34e1cf604 +--- /dev/null ++++ b/fs/bcachefs/sb-members.h +@@ -0,0 +1,176 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SB_MEMBERS_H ++#define _BCACHEFS_SB_MEMBERS_H ++ ++static inline bool bch2_dev_is_online(struct bch_dev *ca) ++{ ++ return !percpu_ref_is_zero(&ca->io_ref); ++} ++ ++static inline bool bch2_dev_is_readable(struct bch_dev *ca) ++{ ++ return bch2_dev_is_online(ca) && ++ ca->mi.state != BCH_MEMBER_STATE_failed; ++} ++ ++static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) ++{ ++ if (!percpu_ref_tryget(&ca->io_ref)) ++ return false; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_rw || ++ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) ++ return true; ++ ++ percpu_ref_put(&ca->io_ref); ++ return false; ++} ++ ++static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) ++{ ++ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); ++} ++ ++static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs.nr; i++) ++ if (devs.devs[i] == dev) ++ return true; ++ ++ return false; ++} ++ ++static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs->nr; i++) ++ if (devs->devs[i] == dev) { ++ array_remove_item(devs->devs, devs->nr, i); ++ return; ++ } ++} ++ ++static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ if (!bch2_dev_list_has_dev(*devs, dev)) { ++ BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); ++ devs->devs[devs->nr++] = dev; ++ } ++} ++ ++static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) ++{ ++ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; ++} ++ ++static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, ++ const struct bch_devs_mask *mask) ++{ ++ struct bch_dev *ca = NULL; ++ ++ while ((*iter = 
mask ++ ? find_next_bit(mask->d, c->sb.nr_devices, *iter) ++ : *iter) < c->sb.nr_devices && ++ !(ca = rcu_dereference_check(c->devs[*iter], ++ lockdep_is_held(&c->state_lock)))) ++ (*iter)++; ++ ++ return ca; ++} ++ ++#define for_each_member_device_rcu(ca, c, iter, mask) \ ++ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) ++ ++static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ if ((ca = __bch2_next_dev(c, iter, NULL))) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++/* ++ * If you break early, you must drop your ref on the current device ++ */ ++#define for_each_member_device(ca, c, iter) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_dev(c, &(iter))); \ ++ percpu_ref_put(&ca->ref), (iter)++) ++ ++static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, ++ unsigned *iter, ++ int state_mask) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ while ((ca = __bch2_next_dev(c, iter, NULL)) && ++ (!((1 << ca->mi.state) & state_mask) || ++ !percpu_ref_tryget(&ca->io_ref))) ++ (*iter)++; ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++#define __for_each_online_member(ca, c, iter, state_mask) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ ++ percpu_ref_put(&ca->io_ref), (iter)++) ++ ++#define for_each_online_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, ~0) ++ ++#define for_each_rw_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) ++ ++#define for_each_readable_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, \ ++ (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) ++ ++/* ++ * If a key exists that references a device, the device won't be going away and ++ * we can omit rcu_read_lock(): ++ */ ++static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_check(c->devs[idx], 1); ++} ++ ++static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_protected(c->devs[idx], ++ lockdep_is_held(&c->sb_lock) || ++ lockdep_is_held(&c->state_lock)); ++} ++ ++/* XXX kill, move to struct bch_fs */ ++static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) ++{ ++ struct bch_devs_mask devs; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ memset(&devs, 0, sizeof(devs)); ++ for_each_online_member(ca, c, i) ++ __set_bit(ca->dev_idx, devs.d); ++ return devs; ++} ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_members; ++ ++#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h new file mode 100644 index 000000000..c1860d816 @@ -80434,6 +81553,3295 @@ index 000000000..3dfaf34a4 +#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) + +#endif /* _SIPHASH_H_ */ +diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c +new file mode 100644 +index 000000000..14cffa68d +--- /dev/null ++++ b/fs/bcachefs/six.c +@@ -0,0 +1,918 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "six.h" ++ ++#ifdef DEBUG ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) do {} while (0) ++#endif ++ ++#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, 
r, 1, NULL, ip) ++#define six_release(l, ip) lock_release(l, ip) ++ ++static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); ++ ++#define SIX_LOCK_HELD_read_OFFSET 0 ++#define SIX_LOCK_HELD_read ~(~0U << 26) ++#define SIX_LOCK_HELD_intent (1U << 26) ++#define SIX_LOCK_HELD_write (1U << 27) ++#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) ++#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent)) ++#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) ++#define SIX_LOCK_NOSPIN (1U << 31) ++ ++struct six_lock_vals { ++ /* Value we add to the lock in order to take the lock: */ ++ u32 lock_val; ++ ++ /* If the lock has this value (used as a mask), taking the lock fails: */ ++ u32 lock_fail; ++ ++ /* Mask that indicates lock is held for this type: */ ++ u32 held_mask; ++ ++ /* Waitlist we wakeup when releasing the lock: */ ++ enum six_lock_type unlock_wakeup; ++}; ++ ++static const struct six_lock_vals l[] = { ++ [SIX_LOCK_read] = { ++ .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, ++ .lock_fail = SIX_LOCK_HELD_write, ++ .held_mask = SIX_LOCK_HELD_read, ++ .unlock_wakeup = SIX_LOCK_write, ++ }, ++ [SIX_LOCK_intent] = { ++ .lock_val = SIX_LOCK_HELD_intent, ++ .lock_fail = SIX_LOCK_HELD_intent, ++ .held_mask = SIX_LOCK_HELD_intent, ++ .unlock_wakeup = SIX_LOCK_intent, ++ }, ++ [SIX_LOCK_write] = { ++ .lock_val = SIX_LOCK_HELD_write, ++ .lock_fail = SIX_LOCK_HELD_read, ++ .held_mask = SIX_LOCK_HELD_write, ++ .unlock_wakeup = SIX_LOCK_read, ++ }, ++}; ++ ++static inline void six_set_bitmask(struct six_lock *lock, u32 mask) ++{ ++ if ((atomic_read(&lock->state) & mask) != mask) ++ atomic_or(mask, &lock->state); ++} ++ ++static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) ++{ ++ if (atomic_read(&lock->state) & mask) ++ atomic_and(~mask, &lock->state); ++} ++ ++static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, ++ u32 old, struct task_struct *owner) ++{ ++ if (type != SIX_LOCK_intent) ++ return; ++ ++ if (!(old & SIX_LOCK_HELD_intent)) { ++ EBUG_ON(lock->owner); ++ lock->owner = owner; ++ } else { ++ EBUG_ON(lock->owner != current); ++ } ++} ++ ++static inline unsigned pcpu_read_count(struct six_lock *lock) ++{ ++ unsigned read_count = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ read_count += *per_cpu_ptr(lock->readers, cpu); ++ return read_count; ++} ++ ++/* ++ * __do_six_trylock() - main trylock routine ++ * ++ * Returns 1 on success, 0 on failure ++ * ++ * In percpu reader mode, a failed trylock may cause a spurious trylock failure ++ * for anoter thread taking the competing lock type, and we may havve to do a ++ * wakeup: when a wakeup is required, we return -1 - wakeup_type. ++ */ ++static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, ++ struct task_struct *task, bool try) ++{ ++ int ret; ++ u32 old; ++ ++ EBUG_ON(type == SIX_LOCK_write && lock->owner != task); ++ EBUG_ON(type == SIX_LOCK_write && ++ (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); ++ ++ /* ++ * Percpu reader mode: ++ * ++ * The basic idea behind this algorithm is that you can implement a lock ++ * between two threads without any atomics, just memory barriers: ++ * ++ * For two threads you'll need two variables, one variable for "thread a ++ * has the lock" and another for "thread b has the lock". 
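
The #defines above pack the whole lock state into one 32-bit word: bits 0..25 hold a reader count, bit 26 the intent holder, bit 27 the writer, bits 28..30 mark which wait lists are populated, and bit 31 is the no-spin hint. A small stand-alone check of that layout (mask values copied from the definitions above, the rest is illustrative):

#include <assert.h>
#include <stdint.h>

#define HELD_READ_MASK	(~(~0u << 26))	/* bits 0..25: reader count */
#define HELD_INTENT	(1u << 26)
#define HELD_WRITE	(1u << 27)

int main(void)
{
	uint32_t state = 0;

	state += 3;			/* three read holds add directly to the low bits */
	state |= HELD_INTENT;		/* one intent holder, compatible with readers */

	assert((state & HELD_READ_MASK) == 3);
	assert(state & HELD_INTENT);
	assert(!(state & HELD_WRITE));	/* write is excluded while reads are held */
	return 0;
}

This is why l[SIX_LOCK_read].lock_val is simply 1: taking a read lock is an atomic add of one to the state word, and its lock_fail mask only has to test SIX_LOCK_HELD_write.
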
++ * ++ * To take the lock, a thread sets its variable indicating that it holds ++ * the lock, then issues a full memory barrier, then reads from the ++ * other thread's variable to check if the other thread thinks it has ++ * the lock. If we raced, we backoff and retry/sleep. ++ * ++ * Failure to take the lock may cause a spurious trylock failure in ++ * another thread, because we temporarily set the lock to indicate that ++ * we held it. This would be a problem for a thread in six_lock(), when ++ * they are calling trylock after adding themself to the waitlist and ++ * prior to sleeping. ++ * ++ * Therefore, if we fail to get the lock, and there were waiters of the ++ * type we conflict with, we will have to issue a wakeup. ++ * ++ * Since we may be called under wait_lock (and by the wakeup code ++ * itself), we return that the wakeup has to be done instead of doing it ++ * here. ++ */ ++ if (type == SIX_LOCK_read && lock->readers) { ++ preempt_disable(); ++ this_cpu_inc(*lock->readers); /* signal that we own lock */ ++ ++ smp_mb(); ++ ++ old = atomic_read(&lock->state); ++ ret = !(old & l[type].lock_fail); ++ ++ this_cpu_sub(*lock->readers, !ret); ++ preempt_enable(); ++ ++ if (!ret && (old & SIX_LOCK_WAITING_write)) ++ ret = -1 - SIX_LOCK_write; ++ } else if (type == SIX_LOCK_write && lock->readers) { ++ if (try) { ++ atomic_add(SIX_LOCK_HELD_write, &lock->state); ++ smp_mb__after_atomic(); ++ } ++ ++ ret = !pcpu_read_count(lock); ++ ++ if (try && !ret) { ++ old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); ++ if (old & SIX_LOCK_WAITING_read) ++ ret = -1 - SIX_LOCK_read; ++ } ++ } else { ++ old = atomic_read(&lock->state); ++ do { ++ ret = !(old & l[type].lock_fail); ++ if (!ret || (type == SIX_LOCK_write && !try)) { ++ smp_mb(); ++ break; ++ } ++ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); ++ ++ EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); ++ } ++ ++ if (ret > 0) ++ six_set_owner(lock, type, old, task); ++ ++ EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && ++ (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); ++ ++ return ret; ++} ++ ++static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) ++{ ++ struct six_lock_waiter *w, *next; ++ struct task_struct *task; ++ bool saw_one; ++ int ret; ++again: ++ ret = 0; ++ saw_one = false; ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry_safe(w, next, &lock->wait_list, list) { ++ if (w->lock_want != lock_type) ++ continue; ++ ++ if (saw_one && lock_type != SIX_LOCK_read) ++ goto unlock; ++ saw_one = true; ++ ++ ret = __do_six_trylock(lock, lock_type, w->task, false); ++ if (ret <= 0) ++ goto unlock; ++ ++ /* ++ * Similar to percpu_rwsem_wake_function(), we need to guard ++ * against the wakee noticing w->lock_acquired, returning, and ++ * then exiting before we do the wakeup: ++ */ ++ task = get_task_struct(w->task); ++ __list_del(w->list.prev, w->list.next); ++ /* ++ * The release barrier here ensures the ordering of the ++ * __list_del before setting w->lock_acquired; @w is on the ++ * stack of the thread doing the waiting and will be reused ++ * after it sees w->lock_acquired with no other locking: ++ * pairs with smp_load_acquire() in six_lock_slowpath() ++ */ ++ smp_store_release(&w->lock_acquired, true); ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++ ++ six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); ++unlock: ++ raw_spin_unlock(&lock->wait_lock); ++ ++ if (ret < 0) { ++ lock_type = -ret - 1; ++ goto 
again; ++ } ++} ++ ++__always_inline ++static void six_lock_wakeup(struct six_lock *lock, u32 state, ++ enum six_lock_type lock_type) ++{ ++ if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) ++ return; ++ ++ if (!(state & (SIX_LOCK_WAITING_read << lock_type))) ++ return; ++ ++ __six_lock_wakeup(lock, lock_type); ++} ++ ++__always_inline ++static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) ++{ ++ int ret; ++ ++ ret = __do_six_trylock(lock, type, current, try); ++ if (ret < 0) ++ __six_lock_wakeup(lock, -ret - 1); ++ ++ return ret > 0; ++} ++ ++/** ++ * six_trylock_ip - attempt to take a six lock without blocking ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ ++ * ++ * Return: true on success, false on failure. ++ */ ++bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) ++{ ++ if (!do_six_trylock(lock, type, true)) ++ return false; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); ++ return true; ++} ++EXPORT_SYMBOL_GPL(six_trylock_ip); ++ ++/** ++ * six_relock_ip - attempt to re-take a lock that was held previously ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @seq: lock sequence number obtained from six_lock_seq() while lock was ++ * held previously ++ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ ++ * ++ * Return: true on success, false on failure. ++ */ ++bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq, unsigned long ip) ++{ ++ if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) ++ return false; ++ ++ if (six_lock_seq(lock) != seq) { ++ six_unlock_ip(lock, type, ip); ++ return false; ++ } ++ ++ return true; ++} ++EXPORT_SYMBOL_GPL(six_relock_ip); ++ ++#ifdef CONFIG_LOCK_SPIN_ON_OWNER ++ ++static inline bool six_can_spin_on_owner(struct six_lock *lock) ++{ ++ struct task_struct *owner; ++ bool ret; ++ ++ if (need_resched()) ++ return false; ++ ++ rcu_read_lock(); ++ owner = READ_ONCE(lock->owner); ++ ret = !owner || owner_on_cpu(owner); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline bool six_spin_on_owner(struct six_lock *lock, ++ struct task_struct *owner, ++ u64 end_time) ++{ ++ bool ret = true; ++ unsigned loop = 0; ++ ++ rcu_read_lock(); ++ while (lock->owner == owner) { ++ /* ++ * Ensure we emit the owner->on_cpu, dereference _after_ ++ * checking lock->owner still matches owner. If that fails, ++ * owner might point to freed memory. If it still matches, ++ * the rcu_read_lock() ensures the memory stays valid. ++ */ ++ barrier(); ++ ++ if (!owner_on_cpu(owner) || need_resched()) { ++ ret = false; ++ break; ++ } ++ ++ if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { ++ six_set_bitmask(lock, SIX_LOCK_NOSPIN); ++ ret = false; ++ break; ++ } ++ ++ cpu_relax(); ++ } ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ struct task_struct *task = current; ++ u64 end_time; ++ ++ if (type == SIX_LOCK_write) ++ return false; ++ ++ preempt_disable(); ++ if (!six_can_spin_on_owner(lock)) ++ goto fail; ++ ++ if (!osq_lock(&lock->osq)) ++ goto fail; ++ ++ end_time = sched_clock() + 10 * NSEC_PER_USEC; ++ ++ while (1) { ++ struct task_struct *owner; ++ ++ /* ++ * If there's an owner, wait for it to either ++ * release the lock or go to sleep. 
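
The percpu reader mode described at the top of __do_six_trylock() is the classic store-then-check protocol: each side publishes its claim, issues a full memory barrier, and only then looks at the other side's claim. A userspace sketch of that idea in C11 atomics (a single shared counter stands in for the percpu reader counters, and the names are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned reader_count;	/* stand-in for the this_cpu reader counters */
static _Atomic unsigned write_held;	/* stand-in for SIX_LOCK_HELD_write */

/* Reader: claim first, full fence, then check for a writer. */
static bool trylock_read(void)
{
	atomic_fetch_add_explicit(&reader_count, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);

	if (atomic_load_explicit(&write_held, memory_order_relaxed)) {
		/* Raced with a writer: back off, as the comment describes. */
		atomic_fetch_sub_explicit(&reader_count, 1, memory_order_relaxed);
		return false;
	}
	return true;
}

/* Writer: claim first, full fence, then check for readers. */
static bool trylock_write(void)
{
	atomic_store_explicit(&write_held, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);

	if (atomic_load_explicit(&reader_count, memory_order_relaxed)) {
		/* Raced with a reader: withdraw our claim and fail. */
		atomic_store_explicit(&write_held, 0, memory_order_relaxed);
		return false;
	}
	return true;
}

With the fences in place both sides can fail in a race, but they can never both succeed. The cost is exactly the spurious failure the comment warns about, which is why __do_six_trylock() returns -1 - lock_type on that path, so the caller wakes the waiters of the conflicting lock type it may have tripped up.
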
++ */ ++ owner = READ_ONCE(lock->owner); ++ if (owner && !six_spin_on_owner(lock, owner, end_time)) ++ break; ++ ++ if (do_six_trylock(lock, type, false)) { ++ osq_unlock(&lock->osq); ++ preempt_enable(); ++ return true; ++ } ++ ++ /* ++ * When there's no owner, we might have preempted between the ++ * owner acquiring the lock and setting the owner field. If ++ * we're an RT task that will live-lock because we won't let ++ * the owner complete. ++ */ ++ if (!owner && (need_resched() || rt_task(task))) ++ break; ++ ++ /* ++ * The cpu_relax() call is a compiler barrier which forces ++ * everything in this loop to be re-loaded. We don't need ++ * memory barriers as we'll eventually observe the right ++ * values at the cost of a few extra spins. ++ */ ++ cpu_relax(); ++ } ++ ++ osq_unlock(&lock->osq); ++fail: ++ preempt_enable(); ++ ++ /* ++ * If we fell out of the spin path because of need_resched(), ++ * reschedule now, before we try-lock again. This avoids getting ++ * scheduled out right after we obtained the lock. ++ */ ++ if (need_resched()) ++ schedule(); ++ ++ return false; ++} ++ ++#else /* CONFIG_LOCK_SPIN_ON_OWNER */ ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ return false; ++} ++ ++#endif ++ ++noinline ++static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, ++ struct six_lock_waiter *wait, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) ++{ ++ int ret = 0; ++ ++ if (type == SIX_LOCK_write) { ++ EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); ++ atomic_add(SIX_LOCK_HELD_write, &lock->state); ++ smp_mb__after_atomic(); ++ } ++ ++ trace_contention_begin(lock, 0); ++ lock_contended(&lock->dep_map, ip); ++ ++ if (six_optimistic_spin(lock, type)) ++ goto out; ++ ++ wait->task = current; ++ wait->lock_want = type; ++ wait->lock_acquired = false; ++ ++ raw_spin_lock(&lock->wait_lock); ++ six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); ++ /* ++ * Retry taking the lock after taking waitlist lock, in case we raced ++ * with an unlock: ++ */ ++ ret = __do_six_trylock(lock, type, current, false); ++ if (ret <= 0) { ++ wait->start_time = local_clock(); ++ ++ if (!list_empty(&lock->wait_list)) { ++ struct six_lock_waiter *last = ++ list_last_entry(&lock->wait_list, ++ struct six_lock_waiter, list); ++ ++ if (time_before_eq64(wait->start_time, last->start_time)) ++ wait->start_time = last->start_time + 1; ++ } ++ ++ list_add_tail(&wait->list, &lock->wait_list); ++ } ++ raw_spin_unlock(&lock->wait_lock); ++ ++ if (unlikely(ret > 0)) { ++ ret = 0; ++ goto out; ++ } ++ ++ if (unlikely(ret < 0)) { ++ __six_lock_wakeup(lock, -ret - 1); ++ ret = 0; ++ } ++ ++ while (1) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ ++ /* ++ * Ensures that writes to the waitlist entry happen after we see ++ * wait->lock_acquired: pairs with the smp_store_release in ++ * __six_lock_wakeup ++ */ ++ if (smp_load_acquire(&wait->lock_acquired)) ++ break; ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (unlikely(ret)) { ++ bool acquired; ++ ++ /* ++ * If should_sleep_fn() returns an error, we are ++ * required to return that error even if we already ++ * acquired the lock - should_sleep_fn() might have ++ * modified external state (e.g. 
when the deadlock cycle ++ * detector in bcachefs issued a transaction restart) ++ */ ++ raw_spin_lock(&lock->wait_lock); ++ acquired = wait->lock_acquired; ++ if (!acquired) ++ list_del(&wait->list); ++ raw_spin_unlock(&lock->wait_lock); ++ ++ if (unlikely(acquired)) ++ do_six_unlock_type(lock, type); ++ break; ++ } ++ ++ schedule(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++out: ++ if (ret && type == SIX_LOCK_write) { ++ six_clear_bitmask(lock, SIX_LOCK_HELD_write); ++ six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); ++ } ++ trace_contention_end(lock, 0); ++ ++ return ret; ++} ++ ++/** ++ * six_lock_ip_waiter - take a lock, with full waitlist interface ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @wait: pointer to wait object, which will be added to lock's waitlist ++ * @should_sleep_fn: callback run after adding to waitlist, immediately prior ++ * to scheduling ++ * @p: passed through to @should_sleep_fn ++ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ ++ * ++ * This is the most general six_lock() variant, with parameters to support full ++ * cycle detection for deadlock avoidance. ++ * ++ * The code calling this function must implement tracking of held locks, and the ++ * @wait object should be embedded into the struct that tracks held locks - ++ * which must also be accessible in a thread-safe way. ++ * ++ * @should_sleep_fn should invoke the cycle detector; it should walk each ++ * lock's waiters, and for each waiter recursively walk their held locks. ++ * ++ * When this function must block, @wait will be added to @lock's waitlist before ++ * calling trylock, and before calling @should_sleep_fn, and @wait will not be ++ * removed from the lock waitlist until the lock has been successfully acquired, ++ * or we abort. ++ * ++ * @wait.start_time will be monotonically increasing for any given waitlist, and ++ * thus may be used as a loop cursor. ++ * ++ * Return: 0 on success, or the return code from @should_sleep_fn on failure. ++ */ ++int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, ++ struct six_lock_waiter *wait, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) ++{ ++ int ret; ++ ++ wait->start_time = 0; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); ++ ++ ret = do_six_trylock(lock, type, true) ? 
0 ++ : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); ++ ++ if (ret && type != SIX_LOCK_write) ++ six_release(&lock->dep_map, ip); ++ if (!ret) ++ lock_acquired(&lock->dep_map, ip); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(six_lock_ip_waiter); ++ ++__always_inline ++static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ u32 state; ++ ++ if (type == SIX_LOCK_intent) ++ lock->owner = NULL; ++ ++ if (type == SIX_LOCK_read && ++ lock->readers) { ++ smp_mb(); /* unlock barrier */ ++ this_cpu_dec(*lock->readers); ++ smp_mb(); /* between unlocking and checking for waiters */ ++ state = atomic_read(&lock->state); ++ } else { ++ u32 v = l[type].lock_val; ++ ++ if (type != SIX_LOCK_read) ++ v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; ++ ++ EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); ++ state = atomic_sub_return_release(v, &lock->state); ++ } ++ ++ six_lock_wakeup(lock, state, l[type].unlock_wakeup); ++} ++ ++/** ++ * six_unlock_ip - drop a six lock ++ * @lock: lock to unlock ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ ++ * ++ * When a lock is held multiple times (because six_lock_incement()) was used), ++ * this decrements the 'lock held' counter by one. ++ * ++ * For example: ++ * six_lock_read(&foo->lock); read count 1 ++ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 ++ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 ++ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 ++ */ ++void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) ++{ ++ EBUG_ON(type == SIX_LOCK_write && ++ !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); ++ EBUG_ON((type == SIX_LOCK_write || ++ type == SIX_LOCK_intent) && ++ lock->owner != current); ++ ++ if (type != SIX_LOCK_write) ++ six_release(&lock->dep_map, ip); ++ else ++ lock->seq++; ++ ++ if (type == SIX_LOCK_intent && ++ lock->intent_lock_recurse) { ++ --lock->intent_lock_recurse; ++ return; ++ } ++ ++ do_six_unlock_type(lock, type); ++} ++EXPORT_SYMBOL_GPL(six_unlock_ip); ++ ++/** ++ * six_lock_downgrade - convert an intent lock to a read lock ++ * @lock: lock to dowgrade ++ * ++ * @lock will have read count incremented and intent count decremented ++ */ ++void six_lock_downgrade(struct six_lock *lock) ++{ ++ six_lock_increment(lock, SIX_LOCK_read); ++ six_unlock_intent(lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_downgrade); ++ ++/** ++ * six_lock_tryupgrade - attempt to convert read lock to an intent lock ++ * @lock: lock to upgrade ++ * ++ * On success, @lock will have intent count incremented and read count ++ * decremented ++ * ++ * Return: true on success, false on failure ++ */ ++bool six_lock_tryupgrade(struct six_lock *lock) ++{ ++ u32 old = atomic_read(&lock->state), new; ++ ++ do { ++ new = old; ++ ++ if (new & SIX_LOCK_HELD_intent) ++ return false; ++ ++ if (!lock->readers) { ++ EBUG_ON(!(new & SIX_LOCK_HELD_read)); ++ new -= l[SIX_LOCK_read].lock_val; ++ } ++ ++ new |= SIX_LOCK_HELD_intent; ++ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); ++ ++ if (lock->readers) ++ this_cpu_dec(*lock->readers); ++ ++ six_set_owner(lock, SIX_LOCK_intent, old, current); ++ ++ return true; ++} ++EXPORT_SYMBOL_GPL(six_lock_tryupgrade); ++ ++/** ++ * six_trylock_convert - attempt to convert a held lock from one type to another ++ * @lock: lock to upgrade ++ * @from: SIX_LOCK_read or SIX_LOCK_intent ++ * @to: SIX_LOCK_read or SIX_LOCK_intent ++ * ++ * 
On success, @lock will have intent count incremented and read count ++ * decremented ++ * ++ * Return: true on success, false on failure ++ */ ++bool six_trylock_convert(struct six_lock *lock, ++ enum six_lock_type from, ++ enum six_lock_type to) ++{ ++ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); ++ ++ if (to == from) ++ return true; ++ ++ if (to == SIX_LOCK_read) { ++ six_lock_downgrade(lock); ++ return true; ++ } else { ++ return six_lock_tryupgrade(lock); ++ } ++} ++EXPORT_SYMBOL_GPL(six_trylock_convert); ++ ++/** ++ * six_lock_increment - increase held lock count on a lock that is already held ++ * @lock: lock to increment ++ * @type: SIX_LOCK_read or SIX_LOCK_intent ++ * ++ * @lock must already be held, with a lock type that is greater than or equal to ++ * @type ++ * ++ * A corresponding six_unlock_type() call will be required for @lock to be fully ++ * unlocked. ++ */ ++void six_lock_increment(struct six_lock *lock, enum six_lock_type type) ++{ ++ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); ++ ++ /* XXX: assert already locked, and that we don't overflow: */ ++ ++ switch (type) { ++ case SIX_LOCK_read: ++ if (lock->readers) { ++ this_cpu_inc(*lock->readers); ++ } else { ++ EBUG_ON(!(atomic_read(&lock->state) & ++ (SIX_LOCK_HELD_read| ++ SIX_LOCK_HELD_intent))); ++ atomic_add(l[type].lock_val, &lock->state); ++ } ++ break; ++ case SIX_LOCK_intent: ++ EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); ++ lock->intent_lock_recurse++; ++ break; ++ case SIX_LOCK_write: ++ BUG(); ++ break; ++ } ++} ++EXPORT_SYMBOL_GPL(six_lock_increment); ++ ++/** ++ * six_lock_wakeup_all - wake up all waiters on @lock ++ * @lock: lock to wake up waiters for ++ * ++ * Wakeing up waiters will cause them to re-run should_sleep_fn, which may then ++ * abort the lock operation. ++ * ++ * This function is never needed in a bug-free program; it's only useful in ++ * debug code, e.g. to determine if a cycle detector is at fault. ++ */ ++void six_lock_wakeup_all(struct six_lock *lock) ++{ ++ u32 state = atomic_read(&lock->state); ++ struct six_lock_waiter *w; ++ ++ six_lock_wakeup(lock, state, SIX_LOCK_read); ++ six_lock_wakeup(lock, state, SIX_LOCK_intent); ++ six_lock_wakeup(lock, state, SIX_LOCK_write); ++ ++ raw_spin_lock(&lock->wait_lock); ++ list_for_each_entry(w, &lock->wait_list, list) ++ wake_up_process(w->task); ++ raw_spin_unlock(&lock->wait_lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_wakeup_all); ++ ++/** ++ * six_lock_counts - return held lock counts, for each lock type ++ * @lock: lock to return counters for ++ * ++ * Return: the number of times a lock is held for read, intent and write. ++ */ ++struct six_lock_count six_lock_counts(struct six_lock *lock) ++{ ++ struct six_lock_count ret; ++ ++ ret.n[SIX_LOCK_read] = !lock->readers ++ ? atomic_read(&lock->state) & SIX_LOCK_HELD_read ++ : pcpu_read_count(lock); ++ ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) + ++ lock->intent_lock_recurse; ++ ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(six_lock_counts); ++ ++/** ++ * six_lock_readers_add - directly manipulate reader count of a lock ++ * @lock: lock to add/subtract readers for ++ * @nr: reader count to add/subtract ++ * ++ * When an upper layer is implementing lock reentrency, we may have both read ++ * and intent locks on the same lock. 
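
six_lock_tryupgrade() above follows the standard try-cmpxchg loop: snapshot the state word, compute the desired new value, and retry from the refreshed snapshot if another CPU changed the word first. A compact sketch of that loop shape with C11 atomics (the field layout is the simplified one from the earlier sketch, and the caller is assumed to hold at least one read lock):

#include <stdatomic.h>
#include <stdbool.h>

#define HELD_READ_MASK	(~(~0u << 26))	/* bits 0..25: reader count (illustrative) */
#define HELD_INTENT	(1u << 26)

/* Try to convert one of our read holds into the intent lock, atomically. */
static bool tryupgrade(_Atomic unsigned *state)
{
	unsigned old = atomic_load_explicit(state, memory_order_relaxed);
	unsigned new;

	do {
		if (old & HELD_INTENT)			/* somebody else holds intent */
			return false;

		new = (old - 1) | HELD_INTENT;		/* drop our read, take intent */
	} while (!atomic_compare_exchange_weak_explicit(state, &old, new,
							memory_order_acquire,
							memory_order_relaxed));
	return true;
}

On failure atomic_compare_exchange_weak_explicit() refreshes old with the current value, so the bailout condition is re-evaluated against fresh state, exactly as the kernel loop does with atomic_try_cmpxchg_acquire(). The real function additionally handles the percpu reader mode, where the read count lives outside the state word.
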
++ * ++ * When we need to take a write lock, the read locks will cause self-deadlock, ++ * because six locks themselves do not track which read locks are held by the ++ * current thread and which are held by a different thread - it does no ++ * per-thread tracking of held locks. ++ * ++ * The upper layer that is tracking held locks may however, if trylock() has ++ * failed, count up its own read locks, subtract them, take the write lock, and ++ * then re-add them. ++ * ++ * As in any other situation when taking a write lock, @lock must be held for ++ * intent one (or more) times, so @lock will never be left unlocked. ++ */ ++void six_lock_readers_add(struct six_lock *lock, int nr) ++{ ++ if (lock->readers) { ++ this_cpu_add(*lock->readers, nr); ++ } else { ++ EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0); ++ /* reader count starts at bit 0 */ ++ atomic_add(nr, &lock->state); ++ } ++} ++EXPORT_SYMBOL_GPL(six_lock_readers_add); ++ ++/** ++ * six_lock_exit - release resources held by a lock prior to freeing ++ * @lock: lock to exit ++ * ++ * When a lock was initialized in percpu mode (SIX_OLCK_INIT_PCPU), this is ++ * required to free the percpu read counts. ++ */ ++void six_lock_exit(struct six_lock *lock) ++{ ++ WARN_ON(lock->readers && pcpu_read_count(lock)); ++ WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read); ++ ++ free_percpu(lock->readers); ++ lock->readers = NULL; ++} ++EXPORT_SYMBOL_GPL(six_lock_exit); ++ ++void __six_lock_init(struct six_lock *lock, const char *name, ++ struct lock_class_key *key, enum six_lock_init_flags flags) ++{ ++ atomic_set(&lock->state, 0); ++ raw_spin_lock_init(&lock->wait_lock); ++ INIT_LIST_HEAD(&lock->wait_list); ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++ ++ /* ++ * Don't assume that we have real percpu variables available in ++ * userspace: ++ */ ++#ifdef __KERNEL__ ++ if (flags & SIX_LOCK_INIT_PCPU) { ++ /* ++ * We don't return an error here on memory allocation failure ++ * since percpu is an optimization, and locks will work with the ++ * same semantics in non-percpu mode: callers can check for ++ * failure if they wish by checking lock->readers, but generally ++ * will not want to treat it as an error. ++ */ ++ lock->readers = alloc_percpu(unsigned); ++ } ++#endif ++} ++EXPORT_SYMBOL_GPL(__six_lock_init); +diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h +new file mode 100644 +index 000000000..394da423c +--- /dev/null ++++ b/fs/bcachefs/six.h +@@ -0,0 +1,388 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _LINUX_SIX_H ++#define _LINUX_SIX_H ++ ++/** ++ * DOC: SIX locks overview ++ * ++ * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores ++ * but with an additional state: read/shared, intent, exclusive/write ++ * ++ * The purpose of the intent state is to allow for greater concurrency on tree ++ * structures without deadlocking. In general, a read can't be upgraded to a ++ * write lock without deadlocking, so an operation that updates multiple nodes ++ * will have to take write locks for the full duration of the operation. ++ * ++ * But by adding an intent state, which is exclusive with other intent locks but ++ * not with readers, we can take intent locks at thte start of the operation, ++ * and then take write locks only for the actual update to each individual ++ * nodes, without deadlocking. 
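
six_lock_readers_add() implements the escape hatch just described: an upper layer that knows how many of the conflicting read holds are its own can hide them, take the write lock, and restore them. An illustrative usage sketch (not a standalone program; it assumes the six.h interface added by this patch, and the wrapper plus the caller's bookkeeping of nr_reads are hypothetical):

#include "six.h"

/*
 * The caller already holds the intent lock plus @nr_reads read locks on
 * @lock (counted by its own lock tracking) and now needs the write lock.
 * Taking it directly would self-deadlock on our own readers.
 */
static void take_write_with_own_readers(struct six_lock *lock, unsigned nr_reads)
{
	six_lock_readers_add(lock, -(int) nr_reads);	/* hide our own read holds */
	six_lock_write(lock, NULL, NULL);		/* only foreign readers block us now */
	six_lock_readers_add(lock, (int) nr_reads);	/* restore the counts */
}
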
++ * ++ * Example usage: ++ * six_lock_read(&foo->lock); ++ * six_unlock_read(&foo->lock); ++ * ++ * An intent lock must be held before taking a write lock: ++ * six_lock_intent(&foo->lock); ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * Other operations: ++ * six_trylock_read() ++ * six_trylock_intent() ++ * six_trylock_write() ++ * ++ * six_lock_downgrade() convert from intent to read ++ * six_lock_tryupgrade() attempt to convert from read to intent, may fail ++ * ++ * There are also interfaces that take the lock type as an enum: ++ * ++ * six_lock_type(&foo->lock, SIX_LOCK_read); ++ * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent) ++ * six_lock_type(&foo->lock, SIX_LOCK_write); ++ * six_unlock_type(&foo->lock, SIX_LOCK_write); ++ * six_unlock_type(&foo->lock, SIX_LOCK_intent); ++ * ++ * Lock sequence numbers - unlock(), relock(): ++ * ++ * Locks embed sequences numbers, which are incremented on write lock/unlock. ++ * This allows locks to be dropped and the retaken iff the state they protect ++ * hasn't changed; this makes it much easier to avoid holding locks while e.g. ++ * doing IO or allocating memory. ++ * ++ * Example usage: ++ * six_lock_read(&foo->lock); ++ * u32 seq = six_lock_seq(&foo->lock); ++ * six_unlock_read(&foo->lock); ++ * ++ * some_operation_that_may_block(); ++ * ++ * if (six_relock_read(&foo->lock, seq)) { ... } ++ * ++ * If the relock operation succeeds, it is as if the lock was never unlocked. ++ * ++ * Reentrancy: ++ * ++ * Six locks are not by themselves reentrent, but have counters for both the ++ * read and intent states that can be used to provide reentrency by an upper ++ * layer that tracks held locks. If a lock is known to already be held in the ++ * read or intent state, six_lock_increment() can be used to bump the "lock ++ * held in this state" counter, increasing the number of unlock calls that ++ * will be required to fully unlock it. ++ * ++ * Example usage: ++ * six_lock_read(&foo->lock); ++ * six_lock_increment(&foo->lock, SIX_LOCK_read); ++ * six_unlock_read(&foo->lock); ++ * six_unlock_read(&foo->lock); ++ * foo->lock is now fully unlocked. ++ * ++ * Since the intent state supercedes read, it's legal to increment the read ++ * counter when holding an intent lock, but not the reverse. ++ * ++ * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) ++ * is not legal. ++ * ++ * should_sleep_fn: ++ * ++ * There is a six_lock() variant that takes a function pointer that is called ++ * immediately prior to schedule() when blocking, and may return an error to ++ * abort. ++ * ++ * One possible use for this feature is when objects being locked are part of ++ * a cache and may reused, and lock ordering is based on a property of the ++ * object that will change when the object is reused - i.e. logical key order. ++ * ++ * If looking up an object in the cache may race with object reuse, and lock ++ * ordering is required to prevent deadlock, object reuse may change the ++ * correct lock order for that object and cause a deadlock. should_sleep_fn ++ * can be used to check if the object is still the object we want and avoid ++ * this deadlock. ++ * ++ * Wait list entry interface: ++ * ++ * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a ++ * wait list entry. 
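
The "unlock(), relock()" discipline documented above is normally wrapped in a retry loop: remember the sequence number, drop the lock across the blocking operation, and start over if a writer got in. An illustrative sketch using only the helpers declared in this header (do_blocking_work() and the retry policy are made up for the example):

#include "six.h"

extern void do_blocking_work(void);	/* stand-in for IO, allocation, etc. */

static void read_then_block(struct six_lock *lock)
{
	u32 seq;
retry:
	six_lock_read(lock, NULL, NULL);
	seq = six_lock_seq(lock);		/* snapshot taken while protected */
	six_unlock_read(lock);

	do_blocking_work();

	if (!six_relock_read(lock, seq))
		goto retry;			/* a write intervened: state may be stale */

	/* ... proceed as if the lock had never been dropped ... */
	six_unlock_read(lock);
}
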
By embedding six_lock_waiter into another object, and by ++ * traversing lock waitlists, it is then possible for an upper layer to ++ * implement full cycle detection for deadlock avoidance. ++ * ++ * should_sleep_fn should be used for invoking the cycle detector, walking the ++ * graph of held locks to check for a deadlock. The upper layer must track ++ * held locks for each thread, and each thread's held locks must be reachable ++ * from its six_lock_waiter object. ++ * ++ * six_lock_waiter() will add the wait object to the waitlist re-trying taking ++ * the lock, and before calling should_sleep_fn, and the wait object will not ++ * be removed from the waitlist until either the lock has been successfully ++ * acquired, or we aborted because should_sleep_fn returned an error. ++ * ++ * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will ++ * have timestamps in strictly ascending order - this is so the timestamp can ++ * be used as a cursor for lock graph traverse. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++enum six_lock_type { ++ SIX_LOCK_read, ++ SIX_LOCK_intent, ++ SIX_LOCK_write, ++}; ++ ++struct six_lock { ++ atomic_t state; ++ u32 seq; ++ unsigned intent_lock_recurse; ++ struct task_struct *owner; ++ unsigned __percpu *readers; ++ struct optimistic_spin_queue osq; ++ raw_spinlock_t wait_lock; ++ struct list_head wait_list; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++struct six_lock_waiter { ++ struct list_head list; ++ struct task_struct *task; ++ enum six_lock_type lock_want; ++ bool lock_acquired; ++ u64 start_time; ++}; ++ ++typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); ++ ++void six_lock_exit(struct six_lock *lock); ++ ++enum six_lock_init_flags { ++ SIX_LOCK_INIT_PCPU = 1U << 0, ++}; ++ ++void __six_lock_init(struct six_lock *lock, const char *name, ++ struct lock_class_key *key, enum six_lock_init_flags flags); ++ ++/** ++ * six_lock_init - initialize a six lock ++ * @lock: lock to initialize ++ * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU ++ */ ++#define six_lock_init(lock, flags) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __six_lock_init((lock), #lock, &__key, flags); \ ++} while (0) ++ ++/** ++ * six_lock_seq - obtain current lock sequence number ++ * @lock: six_lock to obtain sequence number for ++ * ++ * @lock should be held for read or intent, and not write ++ * ++ * By saving the lock sequence number, we can unlock @lock and then (typically ++ * after some blocking operation) attempt to relock it: the relock will succeed ++ * if the sequence number hasn't changed, meaning no write locks have been taken ++ * and state corresponding to what @lock protects is still valid. ++ */ ++static inline u32 six_lock_seq(const struct six_lock *lock) ++{ ++ return lock->seq; ++} ++ ++bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); ++ ++/** ++ * six_trylock_type - attempt to take a six lock without blocking ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * ++ * Return: true on success, false on failure. 
++ */ ++static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ return six_trylock_ip(lock, type, _THIS_IP_); ++} ++ ++int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, ++ struct six_lock_waiter *wait, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip); ++ ++/** ++ * six_lock_waiter - take a lock, with full waitlist interface ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @wait: pointer to wait object, which will be added to lock's waitlist ++ * @should_sleep_fn: callback run after adding to waitlist, immediately prior ++ * to scheduling ++ * @p: passed through to @should_sleep_fn ++ * ++ * This is a convenience wrapper around six_lock_ip_waiter(), see that function ++ * for full documentation. ++ * ++ * Return: 0 on success, or the return code from @should_sleep_fn on failure. ++ */ ++static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type, ++ struct six_lock_waiter *wait, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); ++} ++ ++/** ++ * six_lock_ip - take a six lock lock ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @should_sleep_fn: callback run after adding to waitlist, immediately prior ++ * to scheduling ++ * @p: passed through to @should_sleep_fn ++ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ ++ * ++ * Return: 0 on success, or the return code from @should_sleep_fn on failure. ++ */ ++static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) ++{ ++ struct six_lock_waiter wait; ++ ++ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); ++} ++ ++/** ++ * six_lock_type - take a six lock lock ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @should_sleep_fn: callback run after adding to waitlist, immediately prior ++ * to scheduling ++ * @p: passed through to @should_sleep_fn ++ * ++ * Return: 0 on success, or the return code from @should_sleep_fn on failure. ++ */ ++static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ struct six_lock_waiter wait; ++ ++ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); ++} ++ ++bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq, unsigned long ip); ++ ++/** ++ * six_relock_type - attempt to re-take a lock that was held previously ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @seq: lock sequence number obtained from six_lock_seq() while lock was ++ * held previously ++ * ++ * Return: true on success, false on failure. ++ */ ++static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ return six_relock_ip(lock, type, seq, _THIS_IP_); ++} ++ ++void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); ++ ++/** ++ * six_unlock_type - drop a six lock ++ * @lock: lock to unlock ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * ++ * When a lock is held multiple times (because six_lock_incement()) was used), ++ * this decrements the 'lock held' counter by one. 
++ * ++ * For example: ++ * six_lock_read(&foo->lock); read count 1 ++ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 ++ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 ++ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 ++ */ ++static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ six_unlock_ip(lock, type, _THIS_IP_); ++} ++ ++#define __SIX_LOCK(type) \ ++static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ ++{ \ ++ return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ ++} \ ++ \ ++static inline bool six_trylock_##type(struct six_lock *lock) \ ++{ \ ++ return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ ++} \ ++ \ ++static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ ++ struct six_lock_waiter *wait, \ ++ six_lock_should_sleep_fn should_sleep_fn, void *p,\ ++ unsigned long ip) \ ++{ \ ++ return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ ++} \ ++ \ ++static inline int six_lock_ip_##type(struct six_lock *lock, \ ++ six_lock_should_sleep_fn should_sleep_fn, void *p, \ ++ unsigned long ip) \ ++{ \ ++ return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ ++} \ ++ \ ++static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ ++{ \ ++ return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ ++} \ ++ \ ++static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ ++{ \ ++ return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ ++} \ ++ \ ++static inline int six_lock_##type(struct six_lock *lock, \ ++ six_lock_should_sleep_fn fn, void *p)\ ++{ \ ++ return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ ++} \ ++ \ ++static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ ++{ \ ++ six_unlock_ip(lock, SIX_LOCK_##type, ip); \ ++} \ ++ \ ++static inline void six_unlock_##type(struct six_lock *lock) \ ++{ \ ++ six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ ++} ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++#undef __SIX_LOCK ++ ++void six_lock_downgrade(struct six_lock *); ++bool six_lock_tryupgrade(struct six_lock *); ++bool six_trylock_convert(struct six_lock *, enum six_lock_type, ++ enum six_lock_type); ++ ++void six_lock_increment(struct six_lock *, enum six_lock_type); ++ ++void six_lock_wakeup_all(struct six_lock *); ++ ++struct six_lock_count { ++ unsigned n[3]; ++}; ++ ++struct six_lock_count six_lock_counts(struct six_lock *); ++void six_lock_readers_add(struct six_lock *, int); ++ ++#endif /* _LINUX_SIX_H */ +diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c +new file mode 100644 +index 000000000..9da099114 +--- /dev/null ++++ b/fs/bcachefs/snapshot.c +@@ -0,0 +1,1687 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_buf.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "errcode.h" ++#include "error.h" ++#include "fs.h" ++#include "snapshot.h" ++ ++#include ++ ++/* ++ * Snapshot trees: ++ * ++ * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they ++ * exist to provide a stable identifier for the whole lifetime of a snapshot ++ * tree. 
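++ *
++ * A minimal lookup sketch (illustrative only; assumes @trans is an open
++ * btree transaction and @tree_id was read from a snapshot key's tree field):
++ *
++ *	struct bch_snapshot_tree s_t;
++ *	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
++ *	if (!ret)
++ *		root_id = le32_to_cpu(s_t.root_snapshot);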
++ */ ++ ++void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); ++ ++ prt_printf(out, "subvol %u root snapshot %u", ++ le32_to_cpu(t.v->master_subvol), ++ le32_to_cpu(t.v->root_snapshot)); ++} ++ ++int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) ++{ ++ if (bkey_gt(k.k->p, POS(0, U32_MAX)) || ++ bkey_lt(k.k->p, POS(0, 1))) { ++ prt_printf(err, "bad pos"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ return 0; ++} ++ ++int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, ++ struct bch_snapshot_tree *s) ++{ ++ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), ++ BTREE_ITER_WITH_UPDATES, snapshot_tree, s); ++ ++ if (bch2_err_matches(ret, ENOENT)) ++ ret = -BCH_ERR_ENOENT_snapshot_tree; ++ return ret; ++} ++ ++struct bkey_i_snapshot_tree * ++__bch2_snapshot_tree_create(struct btree_trans *trans) ++{ ++ struct btree_iter iter; ++ int ret = bch2_bkey_get_empty_slot(trans, &iter, ++ BTREE_ID_snapshot_trees, POS(0, U32_MAX)); ++ struct bkey_i_snapshot_tree *s_t; ++ ++ if (ret == -BCH_ERR_ENOSPC_btree_slot) ++ ret = -BCH_ERR_ENOSPC_snapshot_tree; ++ if (ret) ++ return ERR_PTR(ret); ++ ++ s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); ++ ret = PTR_ERR_OR_ZERO(s_t); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret ? ERR_PTR(ret) : s_t; ++} ++ ++static int bch2_snapshot_tree_create(struct btree_trans *trans, ++ u32 root_id, u32 subvol_id, u32 *tree_id) ++{ ++ struct bkey_i_snapshot_tree *n_tree = ++ __bch2_snapshot_tree_create(trans); ++ ++ if (IS_ERR(n_tree)) ++ return PTR_ERR(n_tree); ++ ++ n_tree->v.master_subvol = cpu_to_le32(subvol_id); ++ n_tree->v.root_snapshot = cpu_to_le32(root_id); ++ *tree_id = n_tree->k.p.offset; ++ return 0; ++} ++ ++/* Snapshot nodes: */ ++ ++static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) ++{ ++ struct snapshot_table *t; ++ ++ rcu_read_lock(); ++ t = rcu_dereference(c->snapshots); ++ ++ while (id && id < ancestor) ++ id = __snapshot_t(t, id)->parent; ++ rcu_read_unlock(); ++ ++ return id == ancestor; ++} ++ ++static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) ++{ ++ const struct snapshot_t *s = __snapshot_t(t, id); ++ ++ if (s->skip[2] <= ancestor) ++ return s->skip[2]; ++ if (s->skip[1] <= ancestor) ++ return s->skip[1]; ++ if (s->skip[0] <= ancestor) ++ return s->skip[0]; ++ return s->parent; ++} ++ ++bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) ++{ ++ struct snapshot_table *t; ++ bool ret; ++ ++ EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); ++ ++ rcu_read_lock(); ++ t = rcu_dereference(c->snapshots); ++ ++ while (id && id < ancestor - IS_ANCESTOR_BITMAP) ++ id = get_ancestor_below(t, id, ancestor); ++ ++ if (id && id < ancestor) { ++ ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor); ++ ++ EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor)); ++ } else { ++ ret = id == ancestor; ++ } ++ ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++struct snapshot_t_free_rcu { ++ struct rcu_head rcu; ++ struct snapshot_table *t; ++}; ++ ++static void snapshot_t_free_rcu(struct rcu_head *rcu) ++{ ++ struct snapshot_t_free_rcu *free_rcu = ++ container_of(rcu, struct snapshot_t_free_rcu, rcu); ++ ++ kvfree(free_rcu->t); ++ kfree(free_rcu); ++} ++ ++static noinline struct snapshot_t 
*__snapshot_t_mut(struct bch_fs *c, u32 id) ++{ ++ size_t idx = U32_MAX - id; ++ size_t new_size; ++ struct snapshot_table *new, *old; ++ ++ new_size = max(16UL, roundup_pow_of_two(idx + 1)); ++ ++ new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); ++ if (!new) ++ return NULL; ++ ++ old = rcu_dereference_protected(c->snapshots, true); ++ if (old) ++ memcpy(new->s, ++ rcu_dereference_protected(c->snapshots, true)->s, ++ sizeof(new->s[0]) * c->snapshot_table_size); ++ ++ rcu_assign_pointer(c->snapshots, new); ++ c->snapshot_table_size = new_size; ++ if (old) { ++ struct snapshot_t_free_rcu *rcu = ++ kmalloc(sizeof(*rcu), GFP_KERNEL|__GFP_NOFAIL); ++ ++ rcu->t = old; ++ call_rcu(&rcu->rcu, snapshot_t_free_rcu); ++ } ++ ++ return &rcu_dereference_protected(c->snapshots, true)->s[idx]; ++} ++ ++static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) ++{ ++ size_t idx = U32_MAX - id; ++ ++ lockdep_assert_held(&c->snapshot_table_lock); ++ ++ if (likely(idx < c->snapshot_table_size)) ++ return &rcu_dereference_protected(c->snapshots, true)->s[idx]; ++ ++ return __snapshot_t_mut(c, id); ++} ++ ++void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); ++ ++ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", ++ BCH_SNAPSHOT_SUBVOL(s.v), ++ BCH_SNAPSHOT_DELETED(s.v), ++ le32_to_cpu(s.v->parent), ++ le32_to_cpu(s.v->children[0]), ++ le32_to_cpu(s.v->children[1]), ++ le32_to_cpu(s.v->subvol), ++ le32_to_cpu(s.v->tree)); ++ ++ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) ++ prt_printf(out, " depth %u skiplist %u %u %u", ++ le32_to_cpu(s.v->depth), ++ le32_to_cpu(s.v->skip[0]), ++ le32_to_cpu(s.v->skip[1]), ++ le32_to_cpu(s.v->skip[2])); ++} ++ ++int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) ++{ ++ struct bkey_s_c_snapshot s; ++ u32 i, id; ++ ++ if (bkey_gt(k.k->p, POS(0, U32_MAX)) || ++ bkey_lt(k.k->p, POS(0, 1))) { ++ prt_printf(err, "bad pos"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ s = bkey_s_c_to_snapshot(k); ++ ++ id = le32_to_cpu(s.v->parent); ++ if (id && id <= k.k->p.offset) { ++ prt_printf(err, "bad parent node (%u <= %llu)", ++ id, k.k->p.offset); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { ++ prt_printf(err, "children not normalized"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ if (s.v->children[0] && ++ s.v->children[0] == s.v->children[1]) { ++ prt_printf(err, "duplicate child nodes"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ for (i = 0; i < 2; i++) { ++ id = le32_to_cpu(s.v->children[i]); ++ ++ if (id >= k.k->p.offset) { ++ prt_printf(err, "bad child node (%u >= %llu)", ++ id, k.k->p.offset); ++ return -BCH_ERR_invalid_bkey; ++ } ++ } ++ ++ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { ++ if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || ++ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { ++ prt_printf(err, "skiplist not normalized"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { ++ id = le32_to_cpu(s.v->skip[i]); ++ ++ if ((id && !s.v->parent) || ++ (id && id <= k.k->p.offset)) { ++ prt_printf(err, "bad skiplist node %u", id); ++ return -BCH_ERR_invalid_bkey; ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) ++{ ++ 
struct snapshot_t *t = snapshot_t_mut(c, id); ++ u32 parent = id; ++ ++ while ((parent = bch2_snapshot_parent_early(c, parent)) && ++ parent - id - 1 < IS_ANCESTOR_BITMAP) ++ __set_bit(parent - id - 1, t->is_ancestor); ++} ++ ++static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) ++{ ++ mutex_lock(&c->snapshot_table_lock); ++ __set_is_ancestor_bitmap(c, id); ++ mutex_unlock(&c->snapshot_table_lock); ++} ++ ++int bch2_mark_snapshot(struct btree_trans *trans, ++ enum btree_id btree, unsigned level, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct snapshot_t *t; ++ u32 id = new.k->p.offset; ++ int ret = 0; ++ ++ mutex_lock(&c->snapshot_table_lock); ++ ++ t = snapshot_t_mut(c, id); ++ if (!t) { ++ ret = -BCH_ERR_ENOMEM_mark_snapshot; ++ goto err; ++ } ++ ++ if (new.k->type == KEY_TYPE_snapshot) { ++ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); ++ ++ t->parent = le32_to_cpu(s.v->parent); ++ t->children[0] = le32_to_cpu(s.v->children[0]); ++ t->children[1] = le32_to_cpu(s.v->children[1]); ++ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; ++ t->tree = le32_to_cpu(s.v->tree); ++ ++ if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { ++ t->depth = le32_to_cpu(s.v->depth); ++ t->skip[0] = le32_to_cpu(s.v->skip[0]); ++ t->skip[1] = le32_to_cpu(s.v->skip[1]); ++ t->skip[2] = le32_to_cpu(s.v->skip[2]); ++ } else { ++ t->depth = 0; ++ t->skip[0] = 0; ++ t->skip[1] = 0; ++ t->skip[2] = 0; ++ } ++ ++ __set_is_ancestor_bitmap(c, id); ++ ++ if (BCH_SNAPSHOT_DELETED(s.v)) { ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots); ++ } ++ } else { ++ memset(t, 0, sizeof(*t)); ++ } ++err: ++ mutex_unlock(&c->snapshot_table_lock); ++ return ret; ++} ++ ++int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, ++ struct bch_snapshot *s) ++{ ++ return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), ++ BTREE_ITER_WITH_UPDATES, snapshot, s); ++} ++ ++int bch2_snapshot_live(struct btree_trans *trans, u32 id) ++{ ++ struct bch_snapshot v; ++ int ret; ++ ++ if (!id) ++ return 0; ++ ++ ret = bch2_snapshot_lookup(trans, id, &v); ++ if (bch2_err_matches(ret, ENOENT)) ++ bch_err(trans->c, "snapshot node %u not found", id); ++ if (ret) ++ return ret; ++ ++ return !BCH_SNAPSHOT_DELETED(&v); ++} ++ ++/* ++ * If @k is a snapshot with just one live child, it's part of a linear chain, ++ * which we consider to be an equivalence class: and then after snapshot ++ * deletion cleanup, there should only be a single key at a given position in ++ * this equivalence class. ++ * ++ * This sets the equivalence class of @k to be the child's equivalence class, if ++ * it's part of such a linear chain: this correctly sets equivalence classes on ++ * startup if we run leaf to root (i.e. in natural key order). 
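++ *
++ * Worked example (ids are illustrative): if snapshot 5 has a single live
++ * child 3, and 3 has a single live child 1 (a leaf), then 5, 3 and 1 form one
++ * linear chain; walking in natural key order sets equiv(1) = 1, then
++ * equiv(3) = equiv(1) = 1, then equiv(5) = equiv(3) = 1 - i.e. the whole
++ * class collapses to the id of the leaf-most node.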
++ */ ++int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned i, nr_live = 0, live_idx = 0; ++ struct bkey_s_c_snapshot snap; ++ u32 id = k.k->p.offset, child[2]; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ ++ child[0] = le32_to_cpu(snap.v->children[0]); ++ child[1] = le32_to_cpu(snap.v->children[1]); ++ ++ for (i = 0; i < 2; i++) { ++ int ret = bch2_snapshot_live(trans, child[i]); ++ ++ if (ret < 0) ++ return ret; ++ ++ if (ret) ++ live_idx = i; ++ nr_live += ret; ++ } ++ ++ mutex_lock(&c->snapshot_table_lock); ++ ++ snapshot_t_mut(c, id)->equiv = nr_live == 1 ++ ? snapshot_t_mut(c, child[live_idx])->equiv ++ : id; ++ ++ mutex_unlock(&c->snapshot_table_lock); ++ ++ return 0; ++} ++ ++/* fsck: */ ++ ++static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) ++{ ++ return snapshot_t(c, id)->children[child]; ++} ++ ++static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) ++{ ++ return bch2_snapshot_child(c, id, 0); ++} ++ ++static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) ++{ ++ return bch2_snapshot_child(c, id, 1); ++} ++ ++static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) ++{ ++ u32 n, parent; ++ ++ n = bch2_snapshot_left_child(c, id); ++ if (n) ++ return n; ++ ++ while ((parent = bch2_snapshot_parent(c, id))) { ++ n = bch2_snapshot_right_child(c, parent); ++ if (n && n != id) ++ return n; ++ id = parent; ++ } ++ ++ return 0; ++} ++ ++static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) ++{ ++ u32 id = snapshot_root; ++ u32 subvol = 0, s; ++ ++ while (id) { ++ s = snapshot_t(c, id)->subvol; ++ ++ if (s && (!subvol || s < subvol)) ++ subvol = s; ++ ++ id = bch2_snapshot_tree_next(c, id); ++ } ++ ++ return subvol; ++} ++ ++static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, ++ u32 snapshot_root, u32 *subvol_id) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_subvolume s; ++ bool found = false; ++ int ret; ++ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, ++ 0, k, ret) { ++ if (k.k->type != KEY_TYPE_subvolume) ++ continue; ++ ++ s = bkey_s_c_to_subvolume(k); ++ if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) ++ continue; ++ if (!BCH_SUBVOLUME_SNAP(s.v)) { ++ *subvol_id = s.k->p.offset; ++ found = true; ++ break; ++ } ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (!ret && !found) { ++ struct bkey_i_subvolume *s; ++ ++ *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); ++ ++ s = bch2_bkey_get_mut_typed(trans, &iter, ++ BTREE_ID_subvolumes, POS(0, *subvol_id), ++ 0, subvolume); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (ret) ++ return ret; ++ ++ SET_BCH_SUBVOLUME_SNAP(&s->v, false); ++ } ++ ++ return ret; ++} ++ ++static int check_snapshot_tree(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c_snapshot_tree st; ++ struct bch_snapshot s; ++ struct bch_subvolume subvol; ++ struct printbuf buf = PRINTBUF; ++ u32 root_id; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_snapshot_tree) ++ return 0; ++ ++ st = bkey_s_c_to_snapshot_tree(k); ++ root_id = le32_to_cpu(st.v->root_snapshot); ++ ++ ret = bch2_snapshot_lookup(trans, root_id, &s); ++ if (ret && !bch2_err_matches(ret, ENOENT)) ++ goto err; ++ ++ if (fsck_err_on(ret || ++ root_id != bch2_snapshot_root(c, root_id) || ++ 
st.k->p.offset != le32_to_cpu(s.tree), ++ c, ++ "snapshot tree points to missing/incorrect snapshot:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, 0); ++ goto err; ++ } ++ ++ ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), ++ false, 0, &subvol); ++ if (ret && !bch2_err_matches(ret, ENOENT)) ++ goto err; ++ ++ if (fsck_err_on(ret, c, ++ "snapshot tree points to missing subvolume:\n %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || ++ fsck_err_on(!bch2_snapshot_is_ancestor_early(c, ++ le32_to_cpu(subvol.snapshot), ++ root_id), c, ++ "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || ++ fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c, ++ "snapshot tree points to snapshot subvolume:\n %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { ++ struct bkey_i_snapshot_tree *u; ++ u32 subvol_id; ++ ++ ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); ++ if (ret) ++ goto err; ++ ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ u->v.master_subvol = cpu_to_le32(subvol_id); ++ st = snapshot_tree_i_to_s_c(u); ++ } ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++/* ++ * For each snapshot_tree, make sure it points to the root of a snapshot tree ++ * and that snapshot entry points back to it, or delete it. ++ * ++ * And, make sure it points to a subvolume within that snapshot tree, or correct ++ * it to point to the oldest subvolume within that snapshot tree. ++ */ ++int bch2_check_snapshot_trees(struct bch_fs *c) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ ret = bch2_trans_run(c, ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_snapshot_trees, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_snapshot_tree(&trans, &iter, k))); ++ ++ if (ret) ++ bch_err(c, "error %i checking snapshot trees", ret); ++ return ret; ++} ++ ++/* ++ * Look up snapshot tree for @tree_id and find root, ++ * make sure @snap_id is a descendent: ++ */ ++static int snapshot_tree_ptr_good(struct btree_trans *trans, ++ u32 snap_id, u32 tree_id) ++{ ++ struct bch_snapshot_tree s_t; ++ int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); ++ ++ if (bch2_err_matches(ret, ENOENT)) ++ return 0; ++ if (ret) ++ return ret; ++ ++ return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); ++} ++ ++u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) ++{ ++ const struct snapshot_t *s; ++ ++ if (!id) ++ return 0; ++ ++ rcu_read_lock(); ++ s = snapshot_t(c, id); ++ if (s->parent) ++ id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) ++{ ++ unsigned i; ++ ++ for (i = 0; i < 3; i++) ++ if (!s.parent) { ++ if (s.skip[i]) ++ return false; ++ } else { ++ if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i]))) ++ return false; ++ } ++ ++ return true; ++} ++ ++/* ++ * snapshot_tree pointer was incorrect: look up root snapshot node, make sure ++ * its snapshot_tree pointer is correct (allocate new one if necessary), then ++ * update this node's pointer to root node's pointer: ++ */ 
++static int snapshot_tree_ptr_repair(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct bch_snapshot *s) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter root_iter; ++ struct bch_snapshot_tree s_t; ++ struct bkey_s_c_snapshot root; ++ struct bkey_i_snapshot *u; ++ u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; ++ int ret; ++ ++ root = bch2_bkey_get_iter_typed(trans, &root_iter, ++ BTREE_ID_snapshots, POS(0, root_id), ++ BTREE_ITER_WITH_UPDATES, snapshot); ++ ret = bkey_err(root); ++ if (ret) ++ goto err; ++ ++ tree_id = le32_to_cpu(root.v->tree); ++ ++ ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); ++ if (ret && !bch2_err_matches(ret, ENOENT)) ++ return ret; ++ ++ if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { ++ u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(u) ?: ++ bch2_snapshot_tree_create(trans, root_id, ++ bch2_snapshot_tree_oldest_subvol(c, root_id), ++ &tree_id); ++ if (ret) ++ goto err; ++ ++ u->v.tree = cpu_to_le32(tree_id); ++ if (k.k->p.offset == root_id) ++ *s = u->v; ++ } ++ ++ if (k.k->p.offset != root_id) { ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ u->v.tree = cpu_to_le32(tree_id); ++ *s = u->v; ++ } ++err: ++ bch2_trans_iter_exit(trans, &root_iter); ++ return ret; ++} ++ ++static int check_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_snapshot s; ++ struct bch_subvolume subvol; ++ struct bch_snapshot v; ++ struct bkey_i_snapshot *u; ++ u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); ++ u32 real_depth; ++ struct printbuf buf = PRINTBUF; ++ bool should_have_subvol; ++ u32 i, id; ++ int ret = 0; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ memset(&s, 0, sizeof(s)); ++ memcpy(&s, k.v, bkey_val_bytes(k.k)); ++ ++ id = le32_to_cpu(s.parent); ++ if (id) { ++ ret = bch2_snapshot_lookup(trans, id, &v); ++ if (bch2_err_matches(ret, ENOENT)) ++ bch_err(c, "snapshot with nonexistent parent:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ if (ret) ++ goto err; ++ ++ if (le32_to_cpu(v.children[0]) != k.k->p.offset && ++ le32_to_cpu(v.children[1]) != k.k->p.offset) { ++ bch_err(c, "snapshot parent %u missing pointer to child %llu", ++ id, k.k->p.offset); ++ ret = -EINVAL; ++ goto err; ++ } ++ } ++ ++ for (i = 0; i < 2 && s.children[i]; i++) { ++ id = le32_to_cpu(s.children[i]); ++ ++ ret = bch2_snapshot_lookup(trans, id, &v); ++ if (bch2_err_matches(ret, ENOENT)) ++ bch_err(c, "snapshot node %llu has nonexistent child %u", ++ k.k->p.offset, id); ++ if (ret) ++ goto err; ++ ++ if (le32_to_cpu(v.parent) != k.k->p.offset) { ++ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", ++ id, le32_to_cpu(v.parent), k.k->p.offset); ++ ret = -EINVAL; ++ goto err; ++ } ++ } ++ ++ should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && ++ !BCH_SNAPSHOT_DELETED(&s); ++ ++ if (should_have_subvol) { ++ id = le32_to_cpu(s.subvol); ++ ret = bch2_subvolume_get(trans, id, 0, false, &subvol); ++ if (bch2_err_matches(ret, ENOENT)) ++ bch_err(c, "snapshot points to nonexistent subvolume:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ if (ret) ++ goto err; ++ ++ if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { ++ bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", ++ k.k->p.offset); ++ ret = -EINVAL; ++ goto 
err; ++ } ++ } else { ++ if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ u->v.subvol = 0; ++ s = u->v; ++ } ++ } ++ ++ ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); ++ if (ret < 0) ++ goto err; ++ ++ if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = snapshot_tree_ptr_repair(trans, iter, k, &s); ++ if (ret) ++ goto err; ++ } ++ ret = 0; ++ ++ real_depth = bch2_snapshot_depth(c, parent_id); ++ ++ if (le32_to_cpu(s.depth) != real_depth && ++ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || ++ fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s", ++ real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ u->v.depth = cpu_to_le32(real_depth); ++ s = u->v; ++ } ++ ++ ret = snapshot_skiplist_good(trans, k.k->p.offset, s); ++ if (ret < 0) ++ goto err; ++ ++ if (!ret && ++ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || ++ fsck_err(c, "snapshot with bad skiplist field:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) ++ u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id)); ++ ++ bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); ++ s = u->v; ++ } ++ ret = 0; ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_check_snapshots(struct bch_fs *c) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ /* ++ * We iterate backwards as checking/fixing the depth field requires that ++ * the parent's depth already be correct: ++ */ ++ ret = bch2_trans_run(c, ++ for_each_btree_key_reverse_commit(&trans, iter, ++ BTREE_ID_snapshots, POS_MAX, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_snapshot(&trans, &iter, k))); ++ if (ret) ++ bch_err_fn(c, ret); ++ return ret; ++} ++ ++/* ++ * Mark a snapshot as deleted, for future cleanup: ++ */ ++int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) ++{ ++ struct btree_iter iter; ++ struct bkey_i_snapshot *s; ++ int ret = 0; ++ ++ s = bch2_bkey_get_mut_typed(trans, &iter, ++ BTREE_ID_snapshots, POS(0, id), ++ 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (unlikely(ret)) { ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), ++ trans->c, "missing snapshot %u", id); ++ return ret; ++ } ++ ++ /* already deleted? 
*/ ++ if (BCH_SNAPSHOT_DELETED(&s->v)) ++ goto err; ++ ++ SET_BCH_SNAPSHOT_DELETED(&s->v, true); ++ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); ++ s->v.subvol = 0; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) ++{ ++ if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1])) ++ swap(s->children[0], s->children[1]); ++} ++ ++int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; ++ struct btree_iter c_iter = (struct btree_iter) { NULL }; ++ struct btree_iter tree_iter = (struct btree_iter) { NULL }; ++ struct bkey_s_c_snapshot s; ++ u32 parent_id, child_id; ++ unsigned i; ++ int ret = 0; ++ ++ s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), ++ BTREE_ITER_INTENT, snapshot); ++ ret = bkey_err(s); ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, ++ "missing snapshot %u", id); ++ ++ if (ret) ++ goto err; ++ ++ BUG_ON(s.v->children[1]); ++ ++ parent_id = le32_to_cpu(s.v->parent); ++ child_id = le32_to_cpu(s.v->children[0]); ++ ++ if (parent_id) { ++ struct bkey_i_snapshot *parent; ++ ++ parent = bch2_bkey_get_mut_typed(trans, &p_iter, ++ BTREE_ID_snapshots, POS(0, parent_id), ++ 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(parent); ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, ++ "missing snapshot %u", parent_id); ++ if (unlikely(ret)) ++ goto err; ++ ++ /* find entry in parent->children for node being deleted */ ++ for (i = 0; i < 2; i++) ++ if (le32_to_cpu(parent->v.children[i]) == id) ++ break; ++ ++ if (bch2_fs_inconsistent_on(i == 2, c, ++ "snapshot %u missing child pointer to %u", ++ parent_id, id)) ++ goto err; ++ ++ parent->v.children[i] = le32_to_cpu(child_id); ++ ++ normalize_snapshot_child_pointers(&parent->v); ++ } ++ ++ if (child_id) { ++ struct bkey_i_snapshot *child; ++ ++ child = bch2_bkey_get_mut_typed(trans, &c_iter, ++ BTREE_ID_snapshots, POS(0, child_id), ++ 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(child); ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, ++ "missing snapshot %u", child_id); ++ if (unlikely(ret)) ++ goto err; ++ ++ child->v.parent = cpu_to_le32(parent_id); ++ ++ if (!child->v.parent) { ++ child->v.skip[0] = 0; ++ child->v.skip[1] = 0; ++ child->v.skip[2] = 0; ++ } ++ } ++ ++ if (!parent_id) { ++ /* ++ * We're deleting the root of a snapshot tree: update the ++ * snapshot_tree entry to point to the new root, or delete it if ++ * this is the last snapshot ID in this tree: ++ */ ++ struct bkey_i_snapshot_tree *s_t; ++ ++ BUG_ON(s.v->children[1]); ++ ++ s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, ++ BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), ++ 0, snapshot_tree); ++ ret = PTR_ERR_OR_ZERO(s_t); ++ if (ret) ++ goto err; ++ ++ if (s.v->children[0]) { ++ s_t->v.root_snapshot = s.v->children[0]; ++ } else { ++ s_t->k.type = KEY_TYPE_deleted; ++ set_bkey_val_u64s(&s_t->k, 0); ++ } ++ } ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &tree_iter); ++ bch2_trans_iter_exit(trans, &p_iter); ++ bch2_trans_iter_exit(trans, &c_iter); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_i_snapshot *n; ++ struct bkey_s_c k; ++ 
unsigned i, j; ++ u32 depth = bch2_snapshot_depth(c, parent); ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, ++ POS_MIN, BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ for (i = 0; i < nr_snapids; i++) { ++ k = bch2_btree_iter_prev_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k || !k.k->p.offset) { ++ ret = -BCH_ERR_ENOSPC_snapshot_create; ++ goto err; ++ } ++ ++ n = bch2_bkey_alloc(trans, &iter, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ n->v.flags = 0; ++ n->v.parent = cpu_to_le32(parent); ++ n->v.subvol = cpu_to_le32(snapshot_subvols[i]); ++ n->v.tree = cpu_to_le32(tree); ++ n->v.depth = cpu_to_le32(depth); ++ ++ for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) ++ n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); ++ ++ bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); ++ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); ++ ++ ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, ++ bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); ++ if (ret) ++ goto err; ++ ++ new_snapids[i] = iter.pos.offset; ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/* ++ * Create new snapshot IDs as children of an existing snapshot ID: ++ */ ++static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) ++{ ++ struct btree_iter iter; ++ struct bkey_i_snapshot *n_parent; ++ int ret = 0; ++ ++ n_parent = bch2_bkey_get_mut_typed(trans, &iter, ++ BTREE_ID_snapshots, POS(0, parent), ++ 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(n_parent); ++ if (unlikely(ret)) { ++ if (bch2_err_matches(ret, ENOENT)) ++ bch_err(trans->c, "snapshot %u not found", parent); ++ return ret; ++ } ++ ++ if (n_parent->v.children[0] || n_parent->v.children[1]) { ++ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), ++ new_snapids, snapshot_subvols, nr_snapids); ++ if (ret) ++ goto err; ++ ++ n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); ++ n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); ++ n_parent->v.subvol = 0; ++ SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/* ++ * Create a snapshot node that is the root of a new tree: ++ */ ++static int bch2_snapshot_node_create_tree(struct btree_trans *trans, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) ++{ ++ struct bkey_i_snapshot_tree *n_tree; ++ int ret; ++ ++ n_tree = __bch2_snapshot_tree_create(trans); ++ ret = PTR_ERR_OR_ZERO(n_tree) ?: ++ create_snapids(trans, 0, n_tree->k.p.offset, ++ new_snapids, snapshot_subvols, nr_snapids); ++ if (ret) ++ return ret; ++ ++ n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); ++ n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); ++ return 0; ++} ++ ++int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) ++{ ++ BUG_ON((parent == 0) != (nr_snapids == 1)); ++ BUG_ON((parent != 0) != (nr_snapids == 2)); ++ ++ return parent ++ ? 
bch2_snapshot_node_create_children(trans, parent, ++ new_snapids, snapshot_subvols, nr_snapids) ++ : bch2_snapshot_node_create_tree(trans, ++ new_snapids, snapshot_subvols, nr_snapids); ++ ++} ++ ++/* ++ * If we have an unlinked inode in an internal snapshot node, and the inode ++ * really has been deleted in all child snapshots, how does this get cleaned up? ++ * ++ * first there is the problem of how keys that have been overwritten in all ++ * child snapshots get deleted (unimplemented?), but inodes may perhaps be ++ * special? ++ * ++ * also: unlinked inode in internal snapshot appears to not be getting deleted ++ * correctly if inode doesn't exist in leaf snapshots ++ * ++ * solution: ++ * ++ * for a key in an interior snapshot node that needs work to be done that ++ * requires it to be mutated: iterate over all descendent leaf nodes and copy ++ * that key to snapshot leaf nodes, where we can mutate it ++ */ ++ ++static int snapshot_delete_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ snapshot_id_list *deleted, ++ snapshot_id_list *equiv_seen, ++ struct bpos *last_pos) ++{ ++ struct bch_fs *c = trans->c; ++ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ if (!bkey_eq(k.k->p, *last_pos)) ++ equiv_seen->nr = 0; ++ *last_pos = k.k->p; ++ ++ if (snapshot_list_has_id(deleted, k.k->p.snapshot) || ++ snapshot_list_has_id(equiv_seen, equiv)) { ++ return bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ } else { ++ return snapshot_list_add(c, equiv_seen, equiv); ++ } ++} ++ ++static int move_key_to_correct_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ /* ++ * When we have a linear chain of snapshot nodes, we consider ++ * those to form an equivalence class: we're going to collapse ++ * them all down to a single node, and keep the leaf-most node - ++ * which has the same id as the equivalence class id. ++ * ++ * If there are multiple keys in different snapshots at the same ++ * position, we're only going to keep the one in the newest ++ * snapshot - the rest have been overwritten and are redundant, ++ * and for the key we're going to keep we need to move it to the ++ * equivalance class ID if it's not there already. ++ */ ++ if (equiv != k.k->p.snapshot) { ++ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ++ struct btree_iter new_iter; ++ int ret; ++ ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ return ret; ++ ++ new->k.p.snapshot = equiv; ++ ++ bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_iter_traverse(&new_iter) ?: ++ bch2_trans_update(trans, &new_iter, new, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: ++ bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ bch2_trans_iter_exit(trans, &new_iter); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* ++ * For a given snapshot, if it doesn't have a subvolume that points to it, and ++ * it doesn't have child snapshot nodes - it's now redundant and we can mark it ++ * as deleted. 
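++ *
++ * For example: once both children of an interior node have themselves been
++ * marked deleted, they no longer count as live, so a later pass over the
++ * snapshots btree finds the interior node redundant and marks it deleted
++ * too (provided no subvolume points at it).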
++ */ ++static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot snap; ++ u32 children[2]; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ if (BCH_SNAPSHOT_DELETED(snap.v) || ++ BCH_SNAPSHOT_SUBVOL(snap.v)) ++ return 0; ++ ++ children[0] = le32_to_cpu(snap.v->children[0]); ++ children[1] = le32_to_cpu(snap.v->children[1]); ++ ++ ret = bch2_snapshot_live(trans, children[0]) ?: ++ bch2_snapshot_live(trans, children[1]); ++ if (ret < 0) ++ return ret; ++ ++ if (!ret) ++ return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); ++ return 0; ++} ++ ++static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, ++ snapshot_id_list *skip) ++{ ++ rcu_read_lock(); ++ while (n--) { ++ do { ++ id = __bch2_snapshot_parent(c, id); ++ } while (snapshot_list_has_id(skip, id)); ++ } ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, struct bkey_s_c k, ++ snapshot_id_list *deleted) ++{ ++ struct bch_fs *c = trans->c; ++ u32 nr_deleted_ancestors = 0; ++ struct bkey_i_snapshot *s; ++ u32 *i; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ if (snapshot_list_has_id(deleted, k.k->p.offset)) ++ return 0; ++ ++ s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (ret) ++ return ret; ++ ++ darray_for_each(*deleted, i) ++ nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); ++ ++ if (!nr_deleted_ancestors) ++ return 0; ++ ++ le32_add_cpu(&s->v.depth, -nr_deleted_ancestors); ++ ++ if (!s->v.depth) { ++ s->v.skip[0] = 0; ++ s->v.skip[1] = 0; ++ s->v.skip[2] = 0; ++ } else { ++ u32 depth = le32_to_cpu(s->v.depth); ++ u32 parent = bch2_snapshot_parent(c, s->k.p.offset); ++ ++ for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { ++ u32 id = le32_to_cpu(s->v.skip[j]); ++ ++ if (snapshot_list_has_id(deleted, id)) { ++ id = depth > 1 ++ ? 
bch2_snapshot_nth_parent_skip(c, ++ parent, ++ get_random_u32_below(depth - 1), ++ deleted) ++ : parent; ++ s->v.skip[j] = cpu_to_le32(id); ++ } ++ } ++ ++ bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32); ++ } ++ ++ return bch2_trans_update(trans, iter, &s->k_i, 0); ++} ++ ++int bch2_delete_dead_snapshots(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_snapshot snap; ++ snapshot_id_list deleted = { 0 }; ++ snapshot_id_list deleted_interior = { 0 }; ++ u32 *i, id; ++ int ret = 0; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) { ++ ret = bch2_fs_read_write_early(c); ++ if (ret) { ++ bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ } ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ /* ++ * For every snapshot node: If we have no live children and it's not ++ * pointed to by a subvolume, delete it: ++ */ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ NULL, NULL, 0, ++ bch2_delete_redundant_snapshot(&trans, &iter, k)); ++ if (ret) { ++ bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ bch2_snapshot_set_equiv(&trans, k)); ++ if (ret) { ++ bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_snapshot) ++ continue; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ if (BCH_SNAPSHOT_DELETED(snap.v)) { ++ ret = snapshot_list_add(c, &deleted, k.k->p.offset); ++ if (ret) ++ break; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) { ++ bch_err_msg(c, ret, "walking snapshots"); ++ goto err; ++ } ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ struct bpos last_pos = POS_MIN; ++ snapshot_id_list equiv_seen = { 0 }; ++ struct disk_reservation res = { 0 }; ++ ++ if (!btree_type_has_snapshots(id)) ++ continue; ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ id, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ &res, NULL, BTREE_INSERT_NOFAIL, ++ snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: ++ for_each_btree_key_commit(&trans, iter, ++ id, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ &res, NULL, BTREE_INSERT_NOFAIL, ++ move_key_to_correct_snapshot(&trans, &iter, k)); ++ ++ bch2_disk_reservation_put(c, &res); ++ darray_exit(&equiv_seen); ++ ++ if (ret) { ++ bch_err_msg(c, ret, "deleting keys from dying snapshots"); ++ goto err; ++ } ++ } ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ret) { ++ u32 snapshot = k.k->p.offset; ++ u32 equiv = bch2_snapshot_equiv(c, snapshot); ++ ++ if (equiv != snapshot) ++ snapshot_list_add(c, &deleted_interior, snapshot); ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ /* ++ * Fixing children of deleted snapshots can't be done completely ++ * atomically, if we crash between here and when we delete the interior ++ * nodes some depth fields will be off: ++ */ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, POS_MIN, ++ BTREE_ITER_INTENT, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_fix_child_of_deleted_snapshot(&trans, &iter, k, &deleted_interior)); ++ if (ret) ++ goto err; ++ ++ darray_for_each(deleted, i) { ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_snapshot_node_delete(&trans, *i)); ++ if (ret) { ++ 
bch_err_msg(c, ret, "deleting snapshot %u", *i); ++ goto err; ++ } ++ } ++ ++ darray_for_each(deleted_interior, i) { ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_snapshot_node_delete(&trans, *i)); ++ if (ret) { ++ bch_err_msg(c, ret, "deleting snapshot %u", *i); ++ goto err; ++ } ++ } ++ ++ clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++err: ++ darray_exit(&deleted_interior); ++ darray_exit(&deleted); ++ bch2_trans_exit(&trans); ++ if (ret) ++ bch_err_fn(c, ret); ++ return ret; ++} ++ ++void bch2_delete_dead_snapshots_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); ++ ++ if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) ++ bch2_delete_dead_snapshots(c); ++ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); ++} ++ ++void bch2_delete_dead_snapshots_async(struct bch_fs *c) ++{ ++ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && ++ !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) ++ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); ++} ++ ++int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, ++ struct btree_trans_commit_hook *h) ++{ ++ struct bch_fs *c = trans->c; ++ ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ ++ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) ++ return 0; ++ ++ bch2_delete_dead_snapshots_async(c); ++ return 0; ++} ++ ++int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, ++ enum btree_id id, ++ struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, id, pos, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ while (1) { ++ k = bch2_btree_iter_prev(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ if (!k.k) ++ break; ++ ++ if (!bkey_eq(pos, k.k->p)) ++ break; ++ ++ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { ++ ret = 1; ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) ++{ ++ const struct snapshot_t *s = snapshot_t(c, id); ++ ++ return s->children[1] ?: s->children[0]; ++} ++ ++static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) ++{ ++ u32 child; ++ ++ while ((child = bch2_snapshot_smallest_child(c, id))) ++ id = child; ++ return id; ++} ++ ++static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, ++ enum btree_id btree, ++ struct bkey_s_c interior_k, ++ u32 leaf_id, struct bpos *new_min_pos) ++{ ++ struct btree_iter iter; ++ struct bpos pos = interior_k.k->p; ++ struct bkey_s_c k; ++ struct bkey_i *new; ++ int ret; ++ ++ pos.snapshot = leaf_id; ++ ++ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ ++ /* key already overwritten in this snapshot? 
*/ ++ if (k.k->p.snapshot != interior_k.k->p.snapshot) ++ goto out; ++ ++ if (bpos_eq(*new_min_pos, POS_MIN)) { ++ *new_min_pos = k.k->p; ++ new_min_pos->snapshot = leaf_id; ++ } ++ ++ new = bch2_bkey_make_mut_noupdate(trans, interior_k); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ goto out; ++ ++ new->k.p.snapshot = leaf_id; ++ ret = bch2_trans_update(trans, &iter, new, 0); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, ++ enum btree_id btree, ++ struct bkey_s_c k, ++ struct bpos *new_min_pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_buf sk; ++ int ret; ++ ++ bch2_bkey_buf_init(&sk); ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ *new_min_pos = POS_MIN; ++ ++ for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot); ++ id < k.k->p.snapshot; ++ id++) { ++ if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || ++ !bch2_snapshot_is_leaf(c, id)) ++ continue; ++ ++ ret = commit_do(trans, NULL, NULL, 0, ++ bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos)); ++ if (ret) ++ break; ++ } ++ ++ bch2_bkey_buf_exit(&sk, c); ++ return ret; ++} ++ ++int bch2_snapshots_read(struct bch_fs *c) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ ret = bch2_trans_run(c, ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: ++ bch2_snapshot_set_equiv(&trans, k)) ?: ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); ++ if (ret) ++ bch_err_fn(c, ret); ++ return ret; ++} ++ ++void bch2_fs_snapshots_exit(struct bch_fs *c) ++{ ++ kfree(rcu_dereference_protected(c->snapshots, true)); ++} +diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h +new file mode 100644 +index 000000000..dabc9b9d9 +--- /dev/null ++++ b/fs/bcachefs/snapshot.h +@@ -0,0 +1,272 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SNAPSHOT_H ++#define _BCACHEFS_SNAPSHOT_H ++ ++enum bkey_invalid_flags; ++ ++void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); ++ ++#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ ++ .key_invalid = bch2_snapshot_tree_invalid, \ ++ .val_to_text = bch2_snapshot_tree_to_text, \ ++ .min_val_size = 8, \ ++}) ++ ++struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); ++ ++int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); ++ ++void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); ++int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, ++ struct bkey_s_c, struct bkey_s_c, unsigned); ++ ++#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ ++ .key_invalid = bch2_snapshot_invalid, \ ++ .val_to_text = bch2_snapshot_to_text, \ ++ .atomic_trigger = bch2_mark_snapshot, \ ++ .min_val_size = 24, \ ++}) ++ ++static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) ++{ ++ return &t->s[U32_MAX - id]; ++} ++ ++static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) ++{ ++ return __snapshot_t(rcu_dereference(c->snapshots), id); ++} ++ 
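++/*
++ * A minimal usage sketch for the RCU-protected accessor above: hold
++ * rcu_read_lock() for as long as the returned snapshot_t is in use, as the
++ * small wrappers below do, e.g.:
++ *
++ *	rcu_read_lock();
++ *	parent = snapshot_t(c, id)->parent;
++ *	rcu_read_unlock();
++ */
++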
++static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) ++{ ++ rcu_read_lock(); ++ id = snapshot_t(c, id)->tree; ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) ++{ ++ return snapshot_t(c, id)->parent; ++} ++ ++static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) ++{ ++ rcu_read_lock(); ++ id = __bch2_snapshot_parent_early(c, id); ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ u32 parent = snapshot_t(c, id)->parent; ++ ++ if (parent && ++ snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) ++ panic("id %u depth=%u parent %u depth=%u\n", ++ id, snapshot_t(c, id)->depth, ++ parent, snapshot_t(c, parent)->depth); ++ ++ return parent; ++#else ++ return snapshot_t(c, id)->parent; ++#endif ++} ++ ++static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) ++{ ++ rcu_read_lock(); ++ id = __bch2_snapshot_parent(c, id); ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) ++{ ++ rcu_read_lock(); ++ while (n--) ++ id = __bch2_snapshot_parent(c, id); ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); ++ ++static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) ++{ ++ u32 parent; ++ ++ rcu_read_lock(); ++ while ((parent = __bch2_snapshot_parent(c, id))) ++ id = parent; ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) ++{ ++ return snapshot_t(c, id)->equiv; ++} ++ ++static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) ++{ ++ rcu_read_lock(); ++ id = __bch2_snapshot_equiv(c, id); ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) ++{ ++ return id == bch2_snapshot_equiv(c, id); ++} ++ ++static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) ++{ ++ const struct snapshot_t *s; ++ bool ret; ++ ++ rcu_read_lock(); ++ s = snapshot_t(c, id); ++ ret = s->children[0]; ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) ++{ ++ return !bch2_snapshot_is_internal_node(c, id); ++} ++ ++static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) ++{ ++ const struct snapshot_t *s; ++ u32 parent = __bch2_snapshot_parent(c, id); ++ ++ if (!parent) ++ return 0; ++ ++ s = snapshot_t(c, __bch2_snapshot_parent(c, id)); ++ if (id == s->children[0]) ++ return s->children[1]; ++ if (id == s->children[1]) ++ return s->children[0]; ++ return 0; ++} ++ ++static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) ++{ ++ u32 depth; ++ ++ rcu_read_lock(); ++ depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; ++ rcu_read_unlock(); ++ ++ return depth; ++} ++ ++bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); ++ ++static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) ++{ ++ return id == ancestor ++ ? 
true ++ : __bch2_snapshot_is_ancestor(c, id, ancestor); ++} ++ ++static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) ++{ ++ const struct snapshot_t *t; ++ bool ret; ++ ++ rcu_read_lock(); ++ t = snapshot_t(c, id); ++ ret = (t->children[0]|t->children[1]) != 0; ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) ++{ ++ u32 *i; ++ ++ darray_for_each(*s, i) ++ if (*i == id) ++ return true; ++ return false; ++} ++ ++static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) ++{ ++ u32 *i; ++ ++ darray_for_each(*s, i) ++ if (bch2_snapshot_is_ancestor(c, id, *i)) ++ return true; ++ return false; ++} ++ ++static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) ++{ ++ int ret; ++ ++ BUG_ON(snapshot_list_has_id(s, id)); ++ ret = darray_push(s, id); ++ if (ret) ++ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); ++ return ret; ++} ++ ++int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, ++ struct bch_snapshot *s); ++int bch2_snapshot_get_subvol(struct btree_trans *, u32, ++ struct bch_subvolume *); ++int bch2_snapshot_live(struct btree_trans *trans, u32 id); ++int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k); ++ ++/* only exported for tests: */ ++int bch2_snapshot_node_create(struct btree_trans *, u32, ++ u32 *, u32 *, unsigned); ++ ++int bch2_check_snapshot_trees(struct bch_fs *); ++int bch2_check_snapshots(struct bch_fs *); ++ ++int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); ++int bch2_delete_dead_snapshots_hook(struct btree_trans *, ++ struct btree_trans_commit_hook *); ++void bch2_delete_dead_snapshots_work(struct work_struct *); ++ ++int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); ++ ++static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, ++ enum btree_id id, ++ struct bpos pos) ++{ ++ if (!btree_type_has_snapshots(id) || ++ bch2_snapshot_is_leaf(trans->c, pos.snapshot)) ++ return 0; ++ ++ return __bch2_key_has_snapshot_overwrites(trans, id, pos); ++} ++ ++int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id, ++ struct bkey_s_c, struct bpos *); ++ ++int bch2_snapshots_read(struct bch_fs *); ++void bch2_fs_snapshots_exit(struct bch_fs *); ++ ++#endif /* _BCACHEFS_SNAPSHOT_H */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h new file mode 100644 index 000000000..ae21a8cca @@ -80812,10 +85220,10 @@ index 000000000..ae21a8cca +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 -index 000000000..811a6f428 +index 000000000..0214a98de --- /dev/null +++ b/fs/bcachefs/subvolume.c -@@ -0,0 +1,1749 @@ +@@ -0,0 +1,451 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -80824,861 +85232,13 @@ index 000000000..811a6f428 +#include "errcode.h" +#include "error.h" +#include "fs.h" ++#include "snapshot.h" +#include "subvolume.h" + +#include + +static int bch2_subvolume_delete(struct btree_trans *, u32); + -+static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) -+{ -+ const struct snapshot_t *s = __snapshot_t(t, id); -+ -+ if (s->skip[2] <= ancestor) -+ return s->skip[2]; -+ if (s->skip[1] <= ancestor) -+ return s->skip[1]; -+ if (s->skip[0] <= ancestor) -+ return s->skip[0]; -+ return s->parent; -+} -+ -+bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -+{ -+ 
struct snapshot_table *t; -+ bool ret; -+ -+ EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); -+ -+ rcu_read_lock(); -+ t = rcu_dereference(c->snapshots); -+ -+ while (id && id < ancestor - IS_ANCESTOR_BITMAP) -+ id = get_ancestor_below(t, id, ancestor); -+ -+ ret = id && id < ancestor -+ ? test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor) -+ : id == ancestor; -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) -+{ -+ struct snapshot_table *t; -+ -+ rcu_read_lock(); -+ t = rcu_dereference(c->snapshots); -+ -+ while (id && id < ancestor) -+ id = __snapshot_t(t, id)->parent; -+ rcu_read_unlock(); -+ -+ return id == ancestor; -+} -+ -+static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) -+{ -+ u32 depth; -+ -+ rcu_read_lock(); -+ depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; -+ rcu_read_unlock(); -+ -+ return depth; -+} -+ -+static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) -+{ -+ size_t idx = U32_MAX - id; -+ size_t new_size; -+ struct snapshot_table *new, *old; -+ -+ new_size = max(16UL, roundup_pow_of_two(idx + 1)); -+ -+ new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); -+ if (!new) -+ return NULL; -+ -+ old = c->snapshots; -+ if (old) -+ memcpy(new->s, -+ rcu_dereference_protected(c->snapshots, true)->s, -+ sizeof(new->s[0]) * c->snapshot_table_size); -+ -+ rcu_assign_pointer(c->snapshots, new); -+ c->snapshot_table_size = new_size; -+ if (old) -+ kvfree_rcu(old); -+ -+ return &rcu_dereference_protected(c->snapshots, true)->s[idx]; -+} -+ -+static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) -+{ -+ size_t idx = U32_MAX - id; -+ -+ lockdep_assert_held(&c->snapshot_table_lock); -+ -+ if (likely(idx < c->snapshot_table_size)) -+ return &rcu_dereference_protected(c->snapshots, true)->s[idx]; -+ -+ return __snapshot_t_mut(c, id); -+} -+ -+/* Snapshot tree: */ -+ -+void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); -+ -+ prt_printf(out, "subvol %u root snapshot %u", -+ le32_to_cpu(t.v->master_subvol), -+ le32_to_cpu(t.v->root_snapshot)); -+} -+ -+int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ if (bkey_gt(k.k->p, POS(0, U32_MAX)) || -+ bkey_lt(k.k->p, POS(0, 1))) { -+ prt_printf(err, "bad pos"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, -+ struct bch_snapshot_tree *s) -+{ -+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), -+ BTREE_ITER_WITH_UPDATES, snapshot_tree, s); -+ -+ if (bch2_err_matches(ret, ENOENT)) -+ ret = -BCH_ERR_ENOENT_snapshot_tree; -+ return ret; -+} -+ -+static struct bkey_i_snapshot_tree * -+__snapshot_tree_create(struct btree_trans *trans) -+{ -+ struct btree_iter iter; -+ int ret = bch2_bkey_get_empty_slot(trans, &iter, -+ BTREE_ID_snapshot_trees, POS(0, U32_MAX)); -+ struct bkey_i_snapshot_tree *s_t; -+ -+ if (ret == -BCH_ERR_ENOSPC_btree_slot) -+ ret = -BCH_ERR_ENOSPC_snapshot_tree; -+ if (ret) -+ return ERR_PTR(ret); -+ -+ s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); -+ ret = PTR_ERR_OR_ZERO(s_t); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret ? 
ERR_PTR(ret) : s_t; -+} -+ -+static int snapshot_tree_create(struct btree_trans *trans, -+ u32 root_id, u32 subvol_id, u32 *tree_id) -+{ -+ struct bkey_i_snapshot_tree *n_tree = -+ __snapshot_tree_create(trans); -+ -+ if (IS_ERR(n_tree)) -+ return PTR_ERR(n_tree); -+ -+ n_tree->v.master_subvol = cpu_to_le32(subvol_id); -+ n_tree->v.root_snapshot = cpu_to_le32(root_id); -+ *tree_id = n_tree->k.p.offset; -+ return 0; -+} -+ -+/* Snapshot nodes: */ -+ -+void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); -+ -+ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", -+ BCH_SNAPSHOT_SUBVOL(s.v), -+ BCH_SNAPSHOT_DELETED(s.v), -+ le32_to_cpu(s.v->parent), -+ le32_to_cpu(s.v->children[0]), -+ le32_to_cpu(s.v->children[1]), -+ le32_to_cpu(s.v->subvol), -+ le32_to_cpu(s.v->tree)); -+ -+ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) -+ prt_printf(out, " depth %u skiplist %u %u %u", -+ le32_to_cpu(s.v->depth), -+ le32_to_cpu(s.v->skip[0]), -+ le32_to_cpu(s.v->skip[1]), -+ le32_to_cpu(s.v->skip[2])); -+} -+ -+int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ struct bkey_s_c_snapshot s; -+ u32 i, id; -+ -+ if (bkey_gt(k.k->p, POS(0, U32_MAX)) || -+ bkey_lt(k.k->p, POS(0, 1))) { -+ prt_printf(err, "bad pos"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ s = bkey_s_c_to_snapshot(k); -+ -+ id = le32_to_cpu(s.v->parent); -+ if (id && id <= k.k->p.offset) { -+ prt_printf(err, "bad parent node (%u <= %llu)", -+ id, k.k->p.offset); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { -+ prt_printf(err, "children not normalized"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (s.v->children[0] && -+ s.v->children[0] == s.v->children[1]) { -+ prt_printf(err, "duplicate child nodes"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ for (i = 0; i < 2; i++) { -+ id = le32_to_cpu(s.v->children[i]); -+ -+ if (id >= k.k->p.offset) { -+ prt_printf(err, "bad child node (%u >= %llu)", -+ id, k.k->p.offset); -+ return -BCH_ERR_invalid_bkey; -+ } -+ } -+ -+ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { -+ if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || -+ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { -+ prt_printf(err, "skiplist not normalized"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { -+ id = le32_to_cpu(s.v->skip[i]); -+ -+ if (!id != !s.v->parent || -+ (s.v->parent && -+ id <= k.k->p.offset)) { -+ prt_printf(err, "bad skiplist node %u)", id); -+ return -BCH_ERR_invalid_bkey; -+ } -+ } -+ } -+ -+ return 0; -+} -+ -+int bch2_mark_snapshot(struct btree_trans *trans, -+ enum btree_id btree, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct snapshot_t *t; -+ u32 id = new.k->p.offset; -+ int ret = 0; -+ -+ mutex_lock(&c->snapshot_table_lock); -+ -+ t = snapshot_t_mut(c, id); -+ if (!t) { -+ ret = -BCH_ERR_ENOMEM_mark_snapshot; -+ goto err; -+ } -+ -+ if (new.k->type == KEY_TYPE_snapshot) { -+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); -+ u32 parent = id; -+ -+ t->parent = le32_to_cpu(s.v->parent); -+ t->children[0] = le32_to_cpu(s.v->children[0]); -+ t->children[1] = le32_to_cpu(s.v->children[1]); -+ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? 
le32_to_cpu(s.v->subvol) : 0; -+ t->tree = le32_to_cpu(s.v->tree); -+ -+ if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { -+ t->depth = le32_to_cpu(s.v->depth); -+ t->skip[0] = le32_to_cpu(s.v->skip[0]); -+ t->skip[1] = le32_to_cpu(s.v->skip[1]); -+ t->skip[2] = le32_to_cpu(s.v->skip[2]); -+ } else { -+ t->depth = 0; -+ t->skip[0] = 0; -+ t->skip[1] = 0; -+ t->skip[2] = 0; -+ } -+ -+ while ((parent = bch2_snapshot_parent_early(c, parent)) && -+ parent - id - 1 < IS_ANCESTOR_BITMAP) -+ __set_bit(parent - id - 1, t->is_ancestor); -+ -+ if (BCH_SNAPSHOT_DELETED(s.v)) { -+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots); -+ } -+ } else { -+ memset(t, 0, sizeof(*t)); -+ } -+err: -+ mutex_unlock(&c->snapshot_table_lock); -+ return ret; -+} -+ -+static int snapshot_lookup(struct btree_trans *trans, u32 id, -+ struct bch_snapshot *s) -+{ -+ return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), -+ BTREE_ITER_WITH_UPDATES, snapshot, s); -+} -+ -+static int snapshot_live(struct btree_trans *trans, u32 id) -+{ -+ struct bch_snapshot v; -+ int ret; -+ -+ if (!id) -+ return 0; -+ -+ ret = snapshot_lookup(trans, id, &v); -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(trans->c, "snapshot node %u not found", id); -+ if (ret) -+ return ret; -+ -+ return !BCH_SNAPSHOT_DELETED(&v); -+} -+ -+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned i, nr_live = 0, live_idx = 0; -+ struct bkey_s_c_snapshot snap; -+ u32 id = k.k->p.offset, child[2]; -+ -+ if (k.k->type != KEY_TYPE_snapshot) -+ return 0; -+ -+ snap = bkey_s_c_to_snapshot(k); -+ -+ child[0] = le32_to_cpu(snap.v->children[0]); -+ child[1] = le32_to_cpu(snap.v->children[1]); -+ -+ for (i = 0; i < 2; i++) { -+ int ret = snapshot_live(trans, child[i]); -+ -+ if (ret < 0) -+ return ret; -+ -+ if (ret) -+ live_idx = i; -+ nr_live += ret; -+ } -+ -+ mutex_lock(&c->snapshot_table_lock); -+ -+ snapshot_t_mut(c, id)->equiv = nr_live == 1 -+ ? 
snapshot_t_mut(c, child[live_idx])->equiv -+ : id; -+ -+ mutex_unlock(&c->snapshot_table_lock); -+ -+ return 0; -+} -+ -+/* fsck: */ -+ -+static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) -+{ -+ return snapshot_t(c, id)->children[child]; -+} -+ -+static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) -+{ -+ return bch2_snapshot_child(c, id, 0); -+} -+ -+static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) -+{ -+ return bch2_snapshot_child(c, id, 1); -+} -+ -+static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) -+{ -+ u32 n, parent; -+ -+ n = bch2_snapshot_left_child(c, id); -+ if (n) -+ return n; -+ -+ while ((parent = bch2_snapshot_parent(c, id))) { -+ n = bch2_snapshot_right_child(c, parent); -+ if (n && n != id) -+ return n; -+ id = parent; -+ } -+ -+ return 0; -+} -+ -+static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) -+{ -+ u32 id = snapshot_root; -+ u32 subvol = 0, s; -+ -+ while (id) { -+ s = snapshot_t(c, id)->subvol; -+ -+ if (s && (!subvol || s < subvol)) -+ subvol = s; -+ -+ id = bch2_snapshot_tree_next(c, id); -+ } -+ -+ return subvol; -+} -+ -+static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, -+ u32 snapshot_root, u32 *subvol_id) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_subvolume s; -+ bool found = false; -+ int ret; -+ -+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, -+ 0, k, ret) { -+ if (k.k->type != KEY_TYPE_subvolume) -+ continue; -+ -+ s = bkey_s_c_to_subvolume(k); -+ if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) -+ continue; -+ if (!BCH_SUBVOLUME_SNAP(s.v)) { -+ *subvol_id = s.k->p.offset; -+ found = true; -+ break; -+ } -+ } -+ -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (!ret && !found) { -+ struct bkey_i_subvolume *s; -+ -+ *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); -+ -+ s = bch2_bkey_get_mut_typed(trans, &iter, -+ BTREE_ID_subvolumes, POS(0, *subvol_id), -+ 0, subvolume); -+ ret = PTR_ERR_OR_ZERO(s); -+ if (ret) -+ return ret; -+ -+ SET_BCH_SUBVOLUME_SNAP(&s->v, false); -+ } -+ -+ return ret; -+} -+ -+static int check_snapshot_tree(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c_snapshot_tree st; -+ struct bch_snapshot s; -+ struct bch_subvolume subvol; -+ struct printbuf buf = PRINTBUF; -+ u32 root_id; -+ int ret; -+ -+ if (k.k->type != KEY_TYPE_snapshot_tree) -+ return 0; -+ -+ st = bkey_s_c_to_snapshot_tree(k); -+ root_id = le32_to_cpu(st.v->root_snapshot); -+ -+ ret = snapshot_lookup(trans, root_id, &s); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ goto err; -+ -+ if (fsck_err_on(ret || -+ root_id != bch2_snapshot_root(c, root_id) || -+ st.k->p.offset != le32_to_cpu(s.tree), -+ c, -+ "snapshot tree points to missing/incorrect snapshot:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { -+ ret = bch2_btree_delete_at(trans, iter, 0); -+ goto err; -+ } -+ -+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), -+ false, 0, &subvol); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ goto err; -+ -+ if (fsck_err_on(ret, c, -+ "snapshot tree points to missing subvolume:\n %s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || -+ fsck_err_on(!bch2_snapshot_is_ancestor_early(c, -+ le32_to_cpu(subvol.snapshot), -+ root_id), c, -+ "snapshot tree points to subvolume that does not point 
to snapshot in this tree:\n %s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || -+ fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c, -+ "snapshot tree points to snapshot subvolume:\n %s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { -+ struct bkey_i_snapshot_tree *u; -+ u32 subvol_id; -+ -+ ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); -+ if (ret) -+ goto err; -+ -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ u->v.master_subvol = cpu_to_le32(subvol_id); -+ st = snapshot_tree_i_to_s_c(u); -+ } -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+/* -+ * For each snapshot_tree, make sure it points to the root of a snapshot tree -+ * and that snapshot entry points back to it, or delete it. -+ * -+ * And, make sure it points to a subvolume within that snapshot tree, or correct -+ * it to point to the oldest subvolume within that snapshot tree. -+ */ -+int bch2_check_snapshot_trees(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ ret = bch2_trans_run(c, -+ for_each_btree_key_commit(&trans, iter, -+ BTREE_ID_snapshot_trees, POS_MIN, -+ BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, -+ check_snapshot_tree(&trans, &iter, k))); -+ -+ if (ret) -+ bch_err(c, "error %i checking snapshot trees", ret); -+ return ret; -+} -+ -+/* -+ * Look up snapshot tree for @tree_id and find root, -+ * make sure @snap_id is a descendent: -+ */ -+static int snapshot_tree_ptr_good(struct btree_trans *trans, -+ u32 snap_id, u32 tree_id) -+{ -+ struct bch_snapshot_tree s_t; -+ int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); -+ -+ if (bch2_err_matches(ret, ENOENT)) -+ return 0; -+ if (ret) -+ return ret; -+ -+ return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); -+} -+ -+static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id) -+{ -+ const struct snapshot_t *s; -+ -+ if (!id) -+ return 0; -+ -+ rcu_read_lock(); -+ s = snapshot_t(c, id); -+ if (s->parent) -+ id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) -+{ -+ struct bch_snapshot a; -+ unsigned i; -+ int ret; -+ -+ for (i = 0; i < 3; i++) { -+ if (!s.parent != !s.skip[i]) -+ return false; -+ -+ if (!s.parent) -+ continue; -+ -+ ret = snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a); -+ if (bch2_err_matches(ret, ENOENT)) -+ return false; -+ if (ret) -+ return ret; -+ -+ if (a.tree != s.tree) -+ return false; -+ } -+ -+ return true; -+} -+ -+/* -+ * snapshot_tree pointer was incorrect: look up root snapshot node, make sure -+ * its snapshot_tree pointer is correct (allocate new one if necessary), then -+ * update this node's pointer to root node's pointer: -+ */ -+static int snapshot_tree_ptr_repair(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ struct bch_snapshot *s) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter root_iter; -+ struct bch_snapshot_tree s_t; -+ struct bkey_s_c_snapshot root; -+ struct bkey_i_snapshot *u; -+ u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; -+ int ret; -+ -+ root = bch2_bkey_get_iter_typed(trans, &root_iter, -+ BTREE_ID_snapshots, POS(0, root_id), -+ BTREE_ITER_WITH_UPDATES, snapshot); -+ ret = bkey_err(root); -+ if (ret) -+ goto 
err; -+ -+ tree_id = le32_to_cpu(root.v->tree); -+ -+ ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ return ret; -+ -+ if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { -+ u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u) ?: -+ snapshot_tree_create(trans, root_id, -+ bch2_snapshot_tree_oldest_subvol(c, root_id), -+ &tree_id); -+ if (ret) -+ goto err; -+ -+ u->v.tree = cpu_to_le32(tree_id); -+ if (k.k->p.offset == root_id) -+ *s = u->v; -+ } -+ -+ if (k.k->p.offset != root_id) { -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ u->v.tree = cpu_to_le32(tree_id); -+ *s = u->v; -+ } -+err: -+ bch2_trans_iter_exit(trans, &root_iter); -+ return ret; -+} -+ -+static int check_snapshot(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_snapshot s; -+ struct bch_subvolume subvol; -+ struct bch_snapshot v; -+ struct bkey_i_snapshot *u; -+ u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); -+ u32 real_depth; -+ struct printbuf buf = PRINTBUF; -+ bool should_have_subvol; -+ u32 i, id; -+ int ret = 0; -+ -+ if (k.k->type != KEY_TYPE_snapshot) -+ return 0; -+ -+ memset(&s, 0, sizeof(s)); -+ memcpy(&s, k.v, bkey_val_bytes(k.k)); -+ -+ id = le32_to_cpu(s.parent); -+ if (id) { -+ ret = snapshot_lookup(trans, id, &v); -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(c, "snapshot with nonexistent parent:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ if (ret) -+ goto err; -+ -+ if (le32_to_cpu(v.children[0]) != k.k->p.offset && -+ le32_to_cpu(v.children[1]) != k.k->p.offset) { -+ bch_err(c, "snapshot parent %u missing pointer to child %llu", -+ id, k.k->p.offset); -+ ret = -EINVAL; -+ goto err; -+ } -+ } -+ -+ for (i = 0; i < 2 && s.children[i]; i++) { -+ id = le32_to_cpu(s.children[i]); -+ -+ ret = snapshot_lookup(trans, id, &v); -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(c, "snapshot node %llu has nonexistent child %u", -+ k.k->p.offset, id); -+ if (ret) -+ goto err; -+ -+ if (le32_to_cpu(v.parent) != k.k->p.offset) { -+ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", -+ id, le32_to_cpu(v.parent), k.k->p.offset); -+ ret = -EINVAL; -+ goto err; -+ } -+ } -+ -+ should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && -+ !BCH_SNAPSHOT_DELETED(&s); -+ -+ if (should_have_subvol) { -+ id = le32_to_cpu(s.subvol); -+ ret = bch2_subvolume_get(trans, id, 0, false, &subvol); -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(c, "snapshot points to nonexistent subvolume:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ if (ret) -+ goto err; -+ -+ if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { -+ bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", -+ k.k->p.offset); -+ ret = -EINVAL; -+ goto err; -+ } -+ } else { -+ if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ u->v.subvol = 0; -+ s = u->v; -+ } -+ } -+ -+ ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); -+ if (ret < 0) -+ goto err; -+ -+ if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ ret = 
snapshot_tree_ptr_repair(trans, iter, k, &s); -+ if (ret) -+ goto err; -+ } -+ ret = 0; -+ -+ real_depth = bch2_snapshot_depth(c, parent_id); -+ -+ if (le32_to_cpu(s.depth) != real_depth && -+ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || -+ fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s", -+ real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ u->v.depth = cpu_to_le32(real_depth); -+ s = u->v; -+ } -+ -+ ret = snapshot_skiplist_good(trans, s); -+ if (ret < 0) -+ goto err; -+ -+ if (!ret && -+ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || -+ fsck_err(c, "snapshot with bad skiplist field:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) -+ u->v.skip[i] = cpu_to_le32(snapshot_skiplist_get(c, parent_id)); -+ -+ bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_int); -+ s = u->v; -+ } -+ ret = 0; -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+int bch2_check_snapshots(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ /* -+ * We iterate backwards as checking/fixing the depth field requires that -+ * the parent's depth already be correct: -+ */ -+ ret = bch2_trans_run(c, -+ for_each_btree_key_reverse_commit(&trans, iter, -+ BTREE_ID_snapshots, POS_MAX, -+ BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, -+ check_snapshot(&trans, &iter, k))); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ +static int check_subvol(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) @@ -81694,7 +85254,7 @@ index 000000000..811a6f428 + + subvol = bkey_s_c_to_subvolume(k); + snapid = le32_to_cpu(subvol.v->snapshot); -+ ret = snapshot_lookup(trans, snapid, &snapshot); ++ ret = bch2_snapshot_lookup(trans, snapid, &snapshot); + + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "subvolume %llu points to nonexistent snapshot %u", @@ -81762,462 +85322,6 @@ index 000000000..811a6f428 + return ret; +} + -+void bch2_fs_snapshots_exit(struct bch_fs *c) -+{ -+ kfree(c->snapshots); -+} -+ -+int bch2_snapshots_read(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ ret = bch2_trans_run(c, -+ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, -+ bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: -+ bch2_snapshot_set_equiv(&trans, k))); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+/* -+ * Mark a snapshot as deleted, for future cleanup: -+ */ -+static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) -+{ -+ struct btree_iter iter; -+ struct bkey_i_snapshot *s; -+ int ret = 0; -+ -+ s = bch2_bkey_get_mut_typed(trans, &iter, -+ BTREE_ID_snapshots, POS(0, id), -+ 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(s); -+ if (unlikely(ret)) { -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), -+ trans->c, "missing snapshot %u", id); -+ return ret; -+ } -+ -+ /* already deleted? 
*/ -+ if (BCH_SNAPSHOT_DELETED(&s->v)) -+ goto err; -+ -+ SET_BCH_SNAPSHOT_DELETED(&s->v, true); -+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); -+ s->v.subvol = 0; -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; -+ struct btree_iter tree_iter = (struct btree_iter) { NULL }; -+ struct bkey_s_c_snapshot s; -+ u32 parent_id; -+ unsigned i; -+ int ret = 0; -+ -+ s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), -+ BTREE_ITER_INTENT, snapshot); -+ ret = bkey_err(s); -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, -+ "missing snapshot %u", id); -+ -+ if (ret) -+ goto err; -+ -+ BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); -+ parent_id = le32_to_cpu(s.v->parent); -+ -+ if (parent_id) { -+ struct bkey_i_snapshot *parent; -+ -+ parent = bch2_bkey_get_mut_typed(trans, &p_iter, -+ BTREE_ID_snapshots, POS(0, parent_id), -+ 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(parent); -+ if (unlikely(ret)) { -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, -+ "missing snapshot %u", parent_id); -+ goto err; -+ } -+ -+ for (i = 0; i < 2; i++) -+ if (le32_to_cpu(parent->v.children[i]) == id) -+ break; -+ -+ if (i == 2) -+ bch_err(c, "snapshot %u missing child pointer to %u", -+ parent_id, id); -+ else -+ parent->v.children[i] = 0; -+ -+ if (le32_to_cpu(parent->v.children[0]) < -+ le32_to_cpu(parent->v.children[1])) -+ swap(parent->v.children[0], -+ parent->v.children[1]); -+ } else { -+ /* -+ * We're deleting the root of a snapshot tree: update the -+ * snapshot_tree entry to point to the new root, or delete it if -+ * this is the last snapshot ID in this tree: -+ */ -+ struct bkey_i_snapshot_tree *s_t; -+ -+ BUG_ON(s.v->children[1]); -+ -+ s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, -+ BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), -+ 0, snapshot_tree); -+ ret = PTR_ERR_OR_ZERO(s_t); -+ if (ret) -+ goto err; -+ -+ if (s.v->children[0]) { -+ s_t->v.root_snapshot = s.v->children[0]; -+ } else { -+ s_t->k.type = KEY_TYPE_deleted; -+ set_bkey_val_u64s(&s_t->k, 0); -+ } -+ } -+ -+ ret = bch2_btree_delete_at(trans, &iter, 0); -+err: -+ bch2_trans_iter_exit(trans, &tree_iter); -+ bch2_trans_iter_exit(trans, &p_iter); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_i_snapshot *n; -+ struct bkey_s_c k; -+ unsigned i, j; -+ u32 depth = bch2_snapshot_depth(c, parent); -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, -+ POS_MIN, BTREE_ITER_INTENT); -+ k = bch2_btree_iter_peek(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ for (i = 0; i < nr_snapids; i++) { -+ k = bch2_btree_iter_prev_slot(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (!k.k || !k.k->p.offset) { -+ ret = -BCH_ERR_ENOSPC_snapshot_create; -+ goto err; -+ } -+ -+ n = bch2_bkey_alloc(trans, &iter, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(n); -+ if (ret) -+ goto err; -+ -+ n->v.flags = 0; -+ n->v.parent = cpu_to_le32(parent); -+ n->v.subvol = cpu_to_le32(snapshot_subvols[i]); -+ n->v.tree = cpu_to_le32(tree); -+ n->v.depth = cpu_to_le32(depth); -+ -+ for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) -+ n->v.skip[j] = 
cpu_to_le32(snapshot_skiplist_get(c, parent)); -+ -+ bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_int); -+ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); -+ -+ ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, -+ bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); -+ if (ret) -+ goto err; -+ -+ new_snapids[i] = iter.pos.offset; -+ } -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+/* -+ * Create new snapshot IDs as children of an existing snapshot ID: -+ */ -+static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) -+{ -+ struct btree_iter iter; -+ struct bkey_i_snapshot *n_parent; -+ int ret = 0; -+ -+ n_parent = bch2_bkey_get_mut_typed(trans, &iter, -+ BTREE_ID_snapshots, POS(0, parent), -+ 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(n_parent); -+ if (unlikely(ret)) { -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(trans->c, "snapshot %u not found", parent); -+ return ret; -+ } -+ -+ if (n_parent->v.children[0] || n_parent->v.children[1]) { -+ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), -+ new_snapids, snapshot_subvols, nr_snapids); -+ if (ret) -+ goto err; -+ -+ n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); -+ n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); -+ n_parent->v.subvol = 0; -+ SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+/* -+ * Create a snapshot node that is the root of a new tree: -+ */ -+static int bch2_snapshot_node_create_tree(struct btree_trans *trans, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) -+{ -+ struct bkey_i_snapshot_tree *n_tree; -+ int ret; -+ -+ n_tree = __snapshot_tree_create(trans); -+ ret = PTR_ERR_OR_ZERO(n_tree) ?: -+ create_snapids(trans, 0, n_tree->k.p.offset, -+ new_snapids, snapshot_subvols, nr_snapids); -+ if (ret) -+ return ret; -+ -+ n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); -+ n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); -+ return 0; -+} -+ -+int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) -+{ -+ BUG_ON((parent == 0) != (nr_snapids == 1)); -+ BUG_ON((parent != 0) != (nr_snapids == 2)); -+ -+ return parent -+ ? 
bch2_snapshot_node_create_children(trans, parent, -+ new_snapids, snapshot_subvols, nr_snapids) -+ : bch2_snapshot_node_create_tree(trans, -+ new_snapids, snapshot_subvols, nr_snapids); -+ -+} -+ -+static int snapshot_delete_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ snapshot_id_list *deleted, -+ snapshot_id_list *equiv_seen, -+ struct bpos *last_pos) -+{ -+ struct bch_fs *c = trans->c; -+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); -+ -+ if (!bkey_eq(k.k->p, *last_pos)) -+ equiv_seen->nr = 0; -+ *last_pos = k.k->p; -+ -+ if (snapshot_list_has_id(deleted, k.k->p.snapshot) || -+ snapshot_list_has_id(equiv_seen, equiv)) { -+ return bch2_btree_delete_at(trans, iter, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ } else { -+ return snapshot_list_add(c, equiv_seen, equiv); -+ } -+} -+ -+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_snapshot snap; -+ u32 children[2]; -+ int ret; -+ -+ if (k.k->type != KEY_TYPE_snapshot) -+ return 0; -+ -+ snap = bkey_s_c_to_snapshot(k); -+ if (BCH_SNAPSHOT_DELETED(snap.v) || -+ BCH_SNAPSHOT_SUBVOL(snap.v)) -+ return 0; -+ -+ children[0] = le32_to_cpu(snap.v->children[0]); -+ children[1] = le32_to_cpu(snap.v->children[1]); -+ -+ ret = snapshot_live(trans, children[0]) ?: -+ snapshot_live(trans, children[1]); -+ if (ret < 0) -+ return ret; -+ -+ if (!ret) -+ return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); -+ return 0; -+} -+ -+int bch2_delete_dead_snapshots(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_snapshot snap; -+ snapshot_id_list deleted = { 0 }; -+ u32 i, id; -+ int ret = 0; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) { -+ ret = bch2_fs_read_write_early(c); -+ if (ret) { -+ bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); -+ return ret; -+ } -+ } -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ /* -+ * For every snapshot node: If we have no live children and it's not -+ * pointed to by a subvolume, delete it: -+ */ -+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, -+ NULL, NULL, 0, -+ bch2_delete_redundant_snapshot(&trans, &iter, k)); -+ if (ret) { -+ bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, -+ bch2_snapshot_set_equiv(&trans, k)); -+ if (ret) { -+ bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_snapshot) -+ continue; -+ -+ snap = bkey_s_c_to_snapshot(k); -+ if (BCH_SNAPSHOT_DELETED(snap.v)) { -+ ret = snapshot_list_add(c, &deleted, k.k->p.offset); -+ if (ret) -+ break; -+ } -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret) { -+ bch_err(c, "error walking snapshots: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ struct bpos last_pos = POS_MIN; -+ snapshot_id_list equiv_seen = { 0 }; -+ -+ if (!btree_type_has_snapshots(id)) -+ continue; -+ -+ ret = for_each_btree_key_commit(&trans, iter, -+ id, POS_MIN, -+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, -+ NULL, NULL, BTREE_INSERT_NOFAIL, -+ snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); -+ -+ darray_exit(&equiv_seen); -+ -+ if (ret) { -+ bch_err(c, "error 
deleting snapshot keys: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ } -+ -+ for (i = 0; i < deleted.nr; i++) { -+ ret = commit_do(&trans, NULL, NULL, 0, -+ bch2_snapshot_node_delete(&trans, deleted.data[i])); -+ if (ret) { -+ bch_err(c, "error deleting snapshot %u: %s", -+ deleted.data[i], bch2_err_str(ret)); -+ goto err; -+ } -+ } -+ -+ clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -+err: -+ darray_exit(&deleted); -+ bch2_trans_exit(&trans); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static void bch2_delete_dead_snapshots_work(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); -+ -+ if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) -+ bch2_delete_dead_snapshots(c); -+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -+} -+ -+void bch2_delete_dead_snapshots_async(struct bch_fs *c) -+{ -+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && -+ !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) -+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -+} -+ -+static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, -+ struct btree_trans_commit_hook *h) -+{ -+ struct bch_fs *c = trans->c; -+ -+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -+ -+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) -+ return 0; -+ -+ bch2_delete_dead_snapshots_async(c); -+ return 0; -+} -+ +/* Subvolumes: */ + +int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, @@ -82272,26 +85376,27 @@ index 000000000..811a6f428 +{ + struct bch_snapshot snap; + -+ return snapshot_lookup(trans, snapshot, &snap) ?: ++ return bch2_snapshot_lookup(trans, snapshot, &snap) ?: + bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); +} + -+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, ++int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, + u32 *snapid) +{ + struct btree_iter iter; -+ struct bkey_s_c k; ++ struct bkey_s_c_subvolume subvol; + int ret; + -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), -+ BTREE_ITER_CACHED| -+ BTREE_ITER_WITH_UPDATES); -+ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -BCH_ERR_ENOENT_subvolume; ++ subvol = bch2_bkey_get_iter_typed(trans, &iter, ++ BTREE_ID_subvolumes, POS(0, subvolid), ++ BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES, ++ subvolume); ++ ret = bkey_err(subvol); ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, ++ "missing subvolume %u", subvolid); + + if (likely(!ret)) -+ *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); -+ else if (bch2_err_matches(ret, ENOENT)) -+ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); ++ *snapid = le32_to_cpu(subvol.v->snapshot); + bch2_trans_iter_exit(trans, &iter); + return ret; +} @@ -82321,7 +85426,12 @@ index 000000000..811a6f428 +} + +/* -+ * Scan for subvolumes with parent @subvolid_to_delete, reparent: ++ * Separate from the snapshot tree in the snapshots btree, we record the tree ++ * structure of how snapshot subvolumes were created - the parent subvolume of ++ * each snapshot subvolume. 
++ * ++ * When a subvolume is deleted, we scan for child subvolumes and reparant them, ++ * to avoid dangling references: + */ +static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete) +{ @@ -82567,10 +85677,10 @@ index 000000000..811a6f428 +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 -index 000000000..6905e91a9 +index 000000000..8d4c50f4c --- /dev/null +++ b/fs/bcachefs/subvolume.h -@@ -0,0 +1,258 @@ +@@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_H +#define _BCACHEFS_SUBVOLUME_H @@ -82580,225 +85690,8 @@ index 000000000..6905e91a9 + +enum bkey_invalid_flags; + -+void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+ -+#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ -+ .key_invalid = bch2_snapshot_tree_invalid, \ -+ .val_to_text = bch2_snapshot_tree_to_text, \ -+ .min_val_size = 8, \ -+}) -+ -+int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); -+ -+void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_s_c, unsigned); -+ -+#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ -+ .key_invalid = bch2_snapshot_invalid, \ -+ .val_to_text = bch2_snapshot_to_text, \ -+ .atomic_trigger = bch2_mark_snapshot, \ -+ .min_val_size = 24, \ -+}) -+ -+static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) -+{ -+ return &t->s[U32_MAX - id]; -+} -+ -+static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) -+{ -+ return __snapshot_t(rcu_dereference(c->snapshots), id); -+} -+ -+static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) -+{ -+ rcu_read_lock(); -+ id = snapshot_t(c, id)->tree; -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -+{ -+ return snapshot_t(c, id)->parent; -+} -+ -+static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -+{ -+ rcu_read_lock(); -+ id = __bch2_snapshot_parent_early(c, id); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ u32 parent = snapshot_t(c, id)->parent; -+ -+ if (parent && -+ snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) -+ panic("id %u depth=%u parent %u depth=%u\n", -+ id, snapshot_t(c, id)->depth, -+ parent, snapshot_t(c, parent)->depth); -+ -+ return parent; -+#else -+ return snapshot_t(c, id)->parent; -+#endif -+} -+ -+static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) -+{ -+ rcu_read_lock(); -+ id = __bch2_snapshot_parent(c, id); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) -+{ -+ rcu_read_lock(); -+ while (n--) -+ id = __bch2_snapshot_parent(c, id); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) -+{ -+ u32 parent; -+ -+ rcu_read_lock(); -+ while ((parent = __bch2_snapshot_parent(c, id))) -+ id = parent; -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 __bch2_snapshot_equiv(struct bch_fs 
*c, u32 id) -+{ -+ return snapshot_t(c, id)->equiv; -+} -+ -+static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) -+{ -+ rcu_read_lock(); -+ id = __bch2_snapshot_equiv(c, id); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) -+{ -+ return id == bch2_snapshot_equiv(c, id); -+} -+ -+static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) -+{ -+ const struct snapshot_t *s; -+ bool ret; -+ -+ rcu_read_lock(); -+ s = snapshot_t(c, id); -+ ret = s->children[0]; -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) -+{ -+ return !bch2_snapshot_is_internal_node(c, id); -+} -+ -+static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) -+{ -+ const struct snapshot_t *s; -+ u32 parent = __bch2_snapshot_parent(c, id); -+ -+ if (!parent) -+ return 0; -+ -+ s = snapshot_t(c, __bch2_snapshot_parent(c, id)); -+ if (id == s->children[0]) -+ return s->children[1]; -+ if (id == s->children[1]) -+ return s->children[0]; -+ return 0; -+} -+ -+bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); -+ -+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -+{ -+ return id == ancestor -+ ? true -+ : __bch2_snapshot_is_ancestor(c, id, ancestor); -+} -+ -+static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) -+{ -+ const struct snapshot_t *t; -+ bool ret; -+ -+ rcu_read_lock(); -+ t = snapshot_t(c, id); -+ ret = (t->children[0]|t->children[1]) != 0; -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) -+{ -+ u32 *i; -+ -+ darray_for_each(*s, i) -+ if (*i == id) -+ return true; -+ return false; -+} -+ -+static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) -+{ -+ u32 *i; -+ -+ darray_for_each(*s, i) -+ if (bch2_snapshot_is_ancestor(c, id, *i)) -+ return true; -+ return false; -+} -+ -+static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) -+{ -+ int ret; -+ -+ BUG_ON(snapshot_list_has_id(s, id)); -+ ret = darray_push(s, id); -+ if (ret) -+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); -+ return ret; -+} -+ -+int bch2_check_snapshot_trees(struct bch_fs *); -+int bch2_check_snapshots(struct bch_fs *); +int bch2_check_subvols(struct bch_fs *); + -+void bch2_fs_snapshots_exit(struct bch_fs *); -+int bch2_snapshots_read(struct bch_fs *); -+ +int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, + unsigned, struct printbuf *); +void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -82811,14 +85704,8 @@ index 000000000..6905e91a9 + +int bch2_subvolume_get(struct btree_trans *, unsigned, + bool, int, struct bch_subvolume *); -+int bch2_snapshot_get_subvol(struct btree_trans *, u32, -+ struct bch_subvolume *); +int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + -+/* only exported for tests: */ -+int bch2_snapshot_node_create(struct btree_trans *, u32, -+ u32 *, u32 *, unsigned); -+ +int bch2_delete_dead_snapshots(struct bch_fs *); +void bch2_delete_dead_snapshots_async(struct bch_fs *); + @@ -82868,15 +85755,13 @@ index 000000000..86833445a +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c new file mode 100644 -index 000000000..d2d3eba4d +index 000000000..f01883e78 --- /dev/null +++ b/fs/bcachefs/super-io.c -@@ -0,0 +1,1714 @@ +@@ -0,0 
+1,1265 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" -+#include "btree_update_interior.h" -+#include "buckets.h" +#include "checksum.h" +#include "counters.h" +#include "disk_groups.h" @@ -82884,12 +85769,13 @@ index 000000000..d2d3eba4d +#include "error.h" +#include "io.h" +#include "journal.h" -+#include "journal_io.h" +#include "journal_sb.h" +#include "journal_seq_blacklist.h" +#include "recovery.h" +#include "replicas.h" +#include "quota.h" ++#include "sb-clean.h" ++#include "sb-members.h" +#include "super-io.h" +#include "super.h" +#include "trace.h" @@ -82898,6 +85784,9 @@ index 000000000..d2d3eba4d +#include +#include + ++static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { ++}; ++ +struct bch2_metadata_version { + u16 version; + const char *name; @@ -83036,7 +85925,8 @@ index 000000000..d2d3eba4d +{ + kfree(sb->bio); + if (!IS_ERR_OR_NULL(sb->bdev)) -+ blkdev_put(sb->bdev, sb->mode); ++ blkdev_put(sb->bdev, sb->holder); ++ kfree(sb->holder); + + kfree(sb->sb); + memset(sb, 0, sizeof(*sb)); @@ -83073,8 +85963,14 @@ index 000000000..d2d3eba4d + if (dynamic_fault("bcachefs:add:super_realloc")) + return -BCH_ERR_ENOMEM_sb_realloc_injected; + ++ new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); ++ if (!new_sb) ++ return -BCH_ERR_ENOMEM_sb_buf_realloc; ++ ++ sb->sb = new_sb; ++ + if (sb->have_bio) { -+ unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE); ++ unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size); + + bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!bio) @@ -83086,11 +85982,6 @@ index 000000000..d2d3eba4d + sb->bio = bio; + } + -+ new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); -+ if (!new_sb) -+ return -BCH_ERR_ENOMEM_sb_buf_realloc; -+ -+ sb->sb = new_sb; + sb->buffer_size = new_buffer_size; + + return 0; @@ -83135,16 +86026,13 @@ index 000000000..d2d3eba4d + +/* Superblock validate: */ + -+static inline void __bch2_sb_layout_size_assert(void) -+{ -+ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); -+} -+ +static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) +{ + u64 offset, prev_offset, max_sectors; + unsigned i; + ++ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); ++ + if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && + !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { + prt_printf(out, "Not a bcachefs superblock layout"); @@ -83425,7 +86313,9 @@ index 000000000..d2d3eba4d + d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - + (dst_f ? 
le32_to_cpu(dst_f->u64s) : 0); + if (d > 0) { -+ int ret = bch2_sb_realloc(dst_handle, le32_to_cpu(dst_handle->sb->u64s) + d); ++ int ret = bch2_sb_realloc(dst_handle, ++ le32_to_cpu(dst_handle->sb->u64s) + d); ++ + if (ret) + return ret; + @@ -83539,8 +86429,11 @@ index 000000000..d2d3eba4d +retry: +#endif + memset(sb, 0, sizeof(*sb)); -+ sb->mode = FMODE_READ; ++ sb->mode = BLK_OPEN_READ; + sb->have_bio = true; ++ sb->holder = kmalloc(1, GFP_KERNEL); ++ if (!sb->holder) ++ return -ENOMEM; + +#ifndef __KERNEL__ + if (opt_get(*opts, direct_io) == false) @@ -83548,18 +86441,18 @@ index 000000000..d2d3eba4d +#endif + + if (!opt_get(*opts, noexcl)) -+ sb->mode |= FMODE_EXCL; ++ sb->mode |= BLK_OPEN_EXCL; + + if (!opt_get(*opts, nochanges)) -+ sb->mode |= FMODE_WRITE; ++ sb->mode |= BLK_OPEN_WRITE; + -+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (IS_ERR(sb->bdev) && + PTR_ERR(sb->bdev) == -EACCES && + opt_get(*opts, read_only)) { -+ sb->mode &= ~FMODE_WRITE; ++ sb->mode &= ~BLK_OPEN_WRITE; + -+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (!IS_ERR(sb->bdev)) + opt_set(*opts, nochanges, true); + } @@ -83882,235 +86775,6 @@ index 000000000..d2d3eba4d + mutex_unlock(&c->sb_lock); +} + -+/* BCH_SB_FIELD_members: */ -+ -+static int bch2_sb_members_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_members *mi = field_to_type(f, members); -+ unsigned i; -+ -+ if ((void *) (mi->members + sb->nr_devices) > -+ vstruct_end(&mi->field)) { -+ prt_printf(err, "too many devices for section size"); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ -+ for (i = 0; i < sb->nr_devices; i++) { -+ struct bch_member *m = mi->members + i; -+ -+ if (!bch2_member_exists(m)) -+ continue; -+ -+ if (le64_to_cpu(m->nbuckets) > LONG_MAX) { -+ prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", -+ i, le64_to_cpu(m->nbuckets), LONG_MAX); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ -+ if (le64_to_cpu(m->nbuckets) - -+ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { -+ prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", -+ i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ -+ if (le16_to_cpu(m->bucket_size) < -+ le16_to_cpu(sb->block_size)) { -+ prt_printf(err, "device %u: bucket size %u smaller than block size %u", -+ i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ -+ if (le16_to_cpu(m->bucket_size) < -+ BCH_SB_BTREE_NODE_SIZE(sb)) { -+ prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", -+ i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ } -+ -+ return 0; -+} -+ -+static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_members *mi = field_to_type(f, members); -+ struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); -+ unsigned i; -+ -+ for (i = 0; i < sb->nr_devices; i++) { -+ struct bch_member *m = mi->members + i; -+ unsigned data_have = bch2_sb_dev_has_data(sb, i); -+ u64 bucket_size = le16_to_cpu(m->bucket_size); -+ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; -+ -+ if (!bch2_member_exists(m)) -+ continue; -+ -+ prt_printf(out, "Device:"); -+ 
prt_tab(out); -+ prt_printf(out, "%u", i); -+ prt_newline(out); -+ -+ printbuf_indent_add(out, 2); -+ -+ prt_printf(out, "UUID:"); -+ prt_tab(out); -+ pr_uuid(out, m->uuid.b); -+ prt_newline(out); -+ -+ prt_printf(out, "Size:"); -+ prt_tab(out); -+ prt_units_u64(out, device_size << 9); -+ prt_newline(out); -+ -+ prt_printf(out, "Bucket size:"); -+ prt_tab(out); -+ prt_units_u64(out, bucket_size << 9); -+ prt_newline(out); -+ -+ prt_printf(out, "First bucket:"); -+ prt_tab(out); -+ prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); -+ prt_newline(out); -+ -+ prt_printf(out, "Buckets:"); -+ prt_tab(out); -+ prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); -+ prt_newline(out); -+ -+ prt_printf(out, "Last mount:"); -+ prt_tab(out); -+ if (m->last_mount) -+ pr_time(out, le64_to_cpu(m->last_mount)); -+ else -+ prt_printf(out, "(never)"); -+ prt_newline(out); -+ -+ prt_printf(out, "State:"); -+ prt_tab(out); -+ prt_printf(out, "%s", -+ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR -+ ? bch2_member_states[BCH_MEMBER_STATE(m)] -+ : "unknown"); -+ prt_newline(out); -+ -+ prt_printf(out, "Label:"); -+ prt_tab(out); -+ if (BCH_MEMBER_GROUP(m)) { -+ unsigned idx = BCH_MEMBER_GROUP(m) - 1; -+ -+ if (idx < disk_groups_nr(gi)) -+ prt_printf(out, "%s (%u)", -+ gi->entries[idx].label, idx); -+ else -+ prt_printf(out, "(bad disk labels section)"); -+ } else { -+ prt_printf(out, "(none)"); -+ } -+ prt_newline(out); -+ -+ prt_printf(out, "Data allowed:"); -+ prt_tab(out); -+ if (BCH_MEMBER_DATA_ALLOWED(m)) -+ prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); -+ else -+ prt_printf(out, "(none)"); -+ prt_newline(out); -+ -+ prt_printf(out, "Has data:"); -+ prt_tab(out); -+ if (data_have) -+ prt_bitflags(out, bch2_data_types, data_have); -+ else -+ prt_printf(out, "(none)"); -+ prt_newline(out); -+ -+ prt_printf(out, "Discard:"); -+ prt_tab(out); -+ prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); -+ prt_newline(out); -+ -+ prt_printf(out, "Freespace initialized:"); -+ prt_tab(out); -+ prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); -+ prt_newline(out); -+ -+ printbuf_indent_sub(out, 2); -+ } -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_members = { -+ .validate = bch2_sb_members_validate, -+ .to_text = bch2_sb_members_to_text, -+}; -+ -+/* BCH_SB_FIELD_crypt: */ -+ -+static int bch2_sb_crypt_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); -+ -+ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { -+ prt_printf(err, "wrong size (got %zu should be %zu)", -+ vstruct_bytes(&crypt->field), sizeof(*crypt)); -+ return -BCH_ERR_invalid_sb_crypt; -+ } -+ -+ if (BCH_CRYPT_KDF_TYPE(crypt)) { -+ prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); -+ return -BCH_ERR_invalid_sb_crypt; -+ } -+ -+ return 0; -+} -+ -+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); -+ -+ prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); -+ prt_newline(out); -+ prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); -+ prt_newline(out); -+ prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); -+ prt_newline(out); -+ prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); -+ prt_newline(out); -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { -+ .validate = bch2_sb_crypt_validate, -+ .to_text = bch2_sb_crypt_to_text, -+}; -+ -+/* 
BCH_SB_FIELD_clean: */ -+ -+int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) -+{ -+ struct jset_entry *entry; -+ int ret; -+ -+ for (entry = clean->start; -+ entry < (struct jset_entry *) vstruct_end(&clean->field); -+ entry = vstruct_next(entry)) { -+ ret = bch2_journal_entry_validate(c, NULL, entry, -+ le16_to_cpu(c->disk_sb.sb->version), -+ BCH_SB_BIG_ENDIAN(c->disk_sb.sb), -+ write); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ +/* Downgrade if superblock is at a higher version than currently supported: */ +void bch2_sb_maybe_downgrade(struct bch_fs *c) +{ @@ -84137,232 +86801,6 @@ index 000000000..d2d3eba4d + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); +} + -+int bch2_fs_mark_dirty(struct bch_fs *c) -+{ -+ int ret; -+ -+ /* -+ * Unconditionally write superblock, to verify it hasn't changed before -+ * we go rw: -+ */ -+ -+ mutex_lock(&c->sb_lock); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ -+ bch2_sb_maybe_downgrade(c); -+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); -+ -+ ret = bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -+{ -+ struct jset_entry *entry = *end; -+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); -+ -+ memset(entry, 0, u64s * sizeof(u64)); -+ /* -+ * The u64s field counts from the start of data, ignoring the shared -+ * fields. -+ */ -+ entry->u64s = cpu_to_le16(u64s - 1); -+ -+ *end = vstruct_next(*end); -+ return entry; -+} -+ -+void bch2_journal_super_entries_add_common(struct bch_fs *c, -+ struct jset_entry **end, -+ u64 journal_seq) -+{ -+ struct bch_dev *ca; -+ unsigned i, dev; -+ -+ percpu_down_read(&c->mark_lock); -+ -+ if (!journal_seq) { -+ for (i = 0; i < ARRAY_SIZE(c->usage); i++) -+ bch2_fs_usage_acc_to_base(c, i); -+ } else { -+ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); -+ } -+ -+ { -+ struct jset_entry_usage *u = -+ container_of(jset_entry_init(end, sizeof(*u)), -+ struct jset_entry_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = BCH_FS_USAGE_inodes; -+ u->v = cpu_to_le64(c->usage_base->nr_inodes); -+ } -+ -+ { -+ struct jset_entry_usage *u = -+ container_of(jset_entry_init(end, sizeof(*u)), -+ struct jset_entry_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = BCH_FS_USAGE_key_version; -+ u->v = cpu_to_le64(atomic64_read(&c->key_version)); -+ } -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) { -+ struct jset_entry_usage *u = -+ container_of(jset_entry_init(end, sizeof(*u)), -+ struct jset_entry_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = BCH_FS_USAGE_reserved; -+ u->entry.level = i; -+ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ struct jset_entry_data_usage *u = -+ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), -+ struct jset_entry_data_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_data_usage; -+ u->v = cpu_to_le64(c->usage_base->replicas[i]); -+ unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), -+ "embedded variable length struct"); -+ } -+ -+ for_each_member_device(ca, c, dev) { -+ unsigned b = sizeof(struct jset_entry_dev_usage) + -+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; -+ struct jset_entry_dev_usage *u = -+ container_of(jset_entry_init(end, b), 
-+ struct jset_entry_dev_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_dev_usage; -+ u->dev = cpu_to_le32(dev); -+ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); -+ -+ for (i = 0; i < BCH_DATA_NR; i++) { -+ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); -+ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); -+ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); -+ } -+ } -+ -+ percpu_up_read(&c->mark_lock); -+ -+ for (i = 0; i < 2; i++) { -+ struct jset_entry_clock *clock = -+ container_of(jset_entry_init(end, sizeof(*clock)), -+ struct jset_entry_clock, entry); -+ -+ clock->entry.type = BCH_JSET_ENTRY_clock; -+ clock->rw = i; -+ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); -+ } -+} -+ -+void bch2_fs_mark_clean(struct bch_fs *c) -+{ -+ struct bch_sb_field_clean *sb_clean; -+ struct jset_entry *entry; -+ unsigned u64s; -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ if (BCH_SB_CLEAN(c->disk_sb.sb)) -+ goto out; -+ -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); -+ -+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); -+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); -+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); -+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); -+ -+ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; -+ -+ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); -+ if (!sb_clean) { -+ bch_err(c, "error resizing superblock while setting filesystem clean"); -+ goto out; -+ } -+ -+ sb_clean->flags = 0; -+ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); -+ -+ /* Trying to catch outstanding bug: */ -+ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); -+ -+ entry = sb_clean->start; -+ bch2_journal_super_entries_add_common(c, &entry, 0); -+ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); -+ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); -+ -+ memset(entry, 0, -+ vstruct_end(&sb_clean->field) - (void *) entry); -+ -+ /* -+ * this should be in the write path, and we should be validating every -+ * superblock section: -+ */ -+ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); -+ if (ret) { -+ bch_err(c, "error writing marking filesystem clean: validate error"); -+ goto out; -+ } -+ -+ bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+} -+ -+static int bch2_sb_clean_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_clean *clean = field_to_type(f, clean); -+ -+ if (vstruct_bytes(&clean->field) < sizeof(*clean)) { -+ prt_printf(err, "wrong size (got %zu should be %zu)", -+ vstruct_bytes(&clean->field), sizeof(*clean)); -+ return -BCH_ERR_invalid_sb_clean; -+ } -+ -+ return 0; -+} -+ -+static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_clean *clean = field_to_type(f, clean); -+ struct jset_entry *entry; -+ -+ prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); -+ prt_newline(out); -+ prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); -+ prt_newline(out); -+ -+ for (entry = clean->start; -+ entry != vstruct_end(&clean->field); -+ entry = vstruct_next(entry)) { -+ if (entry->type == BCH_JSET_ENTRY_btree_keys && -+ !entry->u64s) -+ continue; -+ -+ bch2_journal_entry_to_text(out, NULL, entry); -+ prt_newline(out); -+ } -+} -+ -+static const struct 
bch_sb_field_ops bch_sb_field_ops_clean = { -+ .validate = bch2_sb_clean_validate, -+ .to_text = bch2_sb_clean_to_text, -+}; -+ +static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { +#define x(f, nr) \ + [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, @@ -84588,10 +87026,10 @@ index 000000000..d2d3eba4d +} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h new file mode 100644 -index 000000000..904adea6a +index 000000000..d51c0a195 --- /dev/null +++ b/fs/bcachefs/super-io.h -@@ -0,0 +1,142 @@ +@@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_IO_H +#define _BCACHEFS_SUPER_IO_H @@ -84652,6 +87090,7 @@ index 000000000..904adea6a +static inline __le64 bch2_sb_magic(struct bch_fs *c) +{ + __le64 ret; ++ + memcpy(&ret, &c->sb.uuid, sizeof(ret)); + return ret; +} @@ -84715,19 +87154,9 @@ index 000000000..904adea6a + }; +} + -+/* BCH_SB_FIELD_clean: */ -+ -+void bch2_journal_super_entries_add_common(struct bch_fs *, -+ struct jset_entry **, u64); -+ -+int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); -+ +void bch2_sb_maybe_downgrade(struct bch_fs *); +void bch2_sb_upgrade(struct bch_fs *, unsigned); + -+int bch2_fs_mark_dirty(struct bch_fs *); -+void bch2_fs_mark_clean(struct bch_fs *); -+ +void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); +void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); @@ -84736,10 +87165,10 @@ index 000000000..904adea6a +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 -index 000000000..eee56969c +index 000000000..604248659 --- /dev/null +++ b/fs/bcachefs/super.c -@@ -0,0 +1,2007 @@ +@@ -0,0 +1,2015 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and @@ -84755,6 +87184,7 @@ index 000000000..eee56969c +#include "bkey_sort.h" +#include "btree_cache.h" +#include "btree_gc.h" ++#include "btree_journal_iter.h" +#include "btree_key_cache.h" +#include "btree_update_interior.h" +#include "btree_io.h" @@ -84772,6 +87202,8 @@ index 000000000..eee56969c +#include "error.h" +#include "fs.h" +#include "fs-io.h" ++#include "fs-io-buffered.h" ++#include "fs-io-direct.h" +#include "fsck.h" +#include "inode.h" +#include "io.h" @@ -84786,6 +87218,8 @@ index 000000000..eee56969c +#include "rebalance.h" +#include "recovery.h" +#include "replicas.h" ++#include "sb-clean.h" ++#include "snapshot.h" +#include "subvolume.h" +#include "super.h" +#include "super-io.h" @@ -85211,6 +87645,8 @@ index 000000000..eee56969c + bch2_fs_counters_exit(c); + bch2_fs_snapshots_exit(c); + bch2_fs_quota_exit(c); ++ bch2_fs_fs_io_direct_exit(c); ++ bch2_fs_fs_io_buffered_exit(c); + bch2_fs_fsio_exit(c); + bch2_fs_ec_exit(c); + bch2_fs_encryption_exit(c); @@ -85310,19 +87746,14 @@ index 000000000..eee56969c + cancel_work_sync(&ca->io_error_work); + + cancel_work_sync(&c->read_only_work); -+ -+ for (i = 0; i < c->sb.nr_devices; i++) { -+ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); -+ -+ if (ca) -+ bch2_free_super(&ca->disk_sb); -+ } +} + +void bch2_fs_free(struct bch_fs *c) +{ + unsigned i; + ++ BUG_ON(!test_bit(BCH_FS_STOPPING, &c->flags)); ++ + mutex_lock(&bch_fs_list_lock); + list_del(&c->list); + mutex_unlock(&bch_fs_list_lock); @@ -85330,9 +87761,14 @@ index 000000000..eee56969c + closure_sync(&c->cl); + closure_debug_destroy(&c->cl); + -+ for (i = 0; i < c->sb.nr_devices; i++) -+ if (c->devs[i]) -+ 
bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); ++ for (i = 0; i < c->sb.nr_devices; i++) { ++ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); ++ ++ if (ca) { ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_free(ca); ++ } ++ } + + bch_verbose(c, "shutdown complete"); + @@ -85586,7 +88022,9 @@ index 000000000..eee56969c + bch2_fs_encryption_init(c) ?: + bch2_fs_compress_init(c) ?: + bch2_fs_ec_init(c) ?: -+ bch2_fs_fsio_init(c); ++ bch2_fs_fsio_init(c) ?: ++ bch2_fs_fs_io_buffered_init(c); ++ bch2_fs_fs_io_direct_init(c); + if (ret) + goto err; + @@ -85970,8 +88408,6 @@ index 000000000..eee56969c + + /* Commit: */ + ca->disk_sb = *sb; -+ if (sb->mode & FMODE_EXCL) -+ ca->disk_sb.bdev->bd_holder = ca; + memset(sb, 0, sizeof(*sb)); + + ca->dev = ca->disk_sb.bdev->bd_dev; @@ -86742,6 +89178,7 @@ index 000000000..eee56969c +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + ++__maybe_unused +static unsigned bch2_metadata_version = bcachefs_metadata_version_current; +module_param_named(version, bch2_metadata_version, uint, 0400); + @@ -86749,10 +89186,10 @@ index 000000000..eee56969c +module_init(bcachefs_init); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h new file mode 100644 -index 000000000..36bcb9ec2 +index 000000000..bf762df18 --- /dev/null +++ b/fs/bcachefs/super.h -@@ -0,0 +1,266 @@ +@@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_H +#define _BCACHEFS_SUPER_H @@ -86763,220 +89200,6 @@ index 000000000..36bcb9ec2 + +#include + -+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) -+{ -+ return div_u64(s, ca->mi.bucket_size); -+} -+ -+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) -+{ -+ return ((sector_t) b) * ca->mi.bucket_size; -+} -+ -+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -+{ -+ u32 remainder; -+ -+ div_u64_rem(s, ca->mi.bucket_size, &remainder); -+ return remainder; -+} -+ -+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, -+ u32 *offset) -+{ -+ return div_u64_rem(s, ca->mi.bucket_size, offset); -+} -+ -+static inline bool bch2_dev_is_online(struct bch_dev *ca) -+{ -+ return !percpu_ref_is_zero(&ca->io_ref); -+} -+ -+static inline bool bch2_dev_is_readable(struct bch_dev *ca) -+{ -+ return bch2_dev_is_online(ca) && -+ ca->mi.state != BCH_MEMBER_STATE_failed; -+} -+ -+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) -+{ -+ if (!percpu_ref_tryget(&ca->io_ref)) -+ return false; -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_rw || -+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) -+ return true; -+ -+ percpu_ref_put(&ca->io_ref); -+ return false; -+} -+ -+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) -+{ -+ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); -+} -+ -+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, -+ unsigned dev) -+{ -+ unsigned i; -+ -+ for (i = 0; i < devs.nr; i++) -+ if (devs.devs[i] == dev) -+ return true; -+ -+ return false; -+} -+ -+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, -+ unsigned dev) -+{ -+ unsigned i; -+ -+ for (i = 0; i < devs->nr; i++) -+ if (devs->devs[i] == dev) { -+ array_remove_item(devs->devs, devs->nr, i); -+ return; -+ } -+} -+ -+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, -+ unsigned dev) -+{ -+ if (!bch2_dev_list_has_dev(*devs, dev)) { -+ BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); -+ devs->devs[devs->nr++] = dev; -+ } -+} -+ -+static inline 
struct bch_devs_list bch2_dev_list_single(unsigned dev) -+{ -+ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; -+} -+ -+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, -+ const struct bch_devs_mask *mask) -+{ -+ struct bch_dev *ca = NULL; -+ -+ while ((*iter = mask -+ ? find_next_bit(mask->d, c->sb.nr_devices, *iter) -+ : *iter) < c->sb.nr_devices && -+ !(ca = rcu_dereference_check(c->devs[*iter], -+ lockdep_is_held(&c->state_lock)))) -+ (*iter)++; -+ -+ return ca; -+} -+ -+#define for_each_member_device_rcu(ca, c, iter, mask) \ -+ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) -+ -+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) -+{ -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ if ((ca = __bch2_next_dev(c, iter, NULL))) -+ percpu_ref_get(&ca->ref); -+ rcu_read_unlock(); -+ -+ return ca; -+} -+ -+/* -+ * If you break early, you must drop your ref on the current device -+ */ -+#define for_each_member_device(ca, c, iter) \ -+ for ((iter) = 0; \ -+ (ca = bch2_get_next_dev(c, &(iter))); \ -+ percpu_ref_put(&ca->ref), (iter)++) -+ -+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, -+ unsigned *iter, -+ int state_mask) -+{ -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ while ((ca = __bch2_next_dev(c, iter, NULL)) && -+ (!((1 << ca->mi.state) & state_mask) || -+ !percpu_ref_tryget(&ca->io_ref))) -+ (*iter)++; -+ rcu_read_unlock(); -+ -+ return ca; -+} -+ -+#define __for_each_online_member(ca, c, iter, state_mask) \ -+ for ((iter) = 0; \ -+ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ -+ percpu_ref_put(&ca->io_ref), (iter)++) -+ -+#define for_each_online_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, ~0) -+ -+#define for_each_rw_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) -+ -+#define for_each_readable_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, \ -+ (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) -+ -+/* -+ * If a key exists that references a device, the device won't be going away and -+ * we can omit rcu_read_lock(): -+ */ -+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) -+{ -+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); -+ -+ return rcu_dereference_check(c->devs[idx], 1); -+} -+ -+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) -+{ -+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); -+ -+ return rcu_dereference_protected(c->devs[idx], -+ lockdep_is_held(&c->sb_lock) || -+ lockdep_is_held(&c->state_lock)); -+} -+ -+/* XXX kill, move to struct bch_fs */ -+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -+{ -+ struct bch_devs_mask devs; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ memset(&devs, 0, sizeof(devs)); -+ for_each_online_member(ca, c, i) -+ __set_bit(ca->dev_idx, devs.d); -+ return devs; -+} -+ -+static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) -+{ -+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; -+ u64 b_offset = bucket_to_sector(ca, b); -+ u64 b_end = bucket_to_sector(ca, b + 1); -+ unsigned i; -+ -+ if (!b) -+ return true; -+ -+ for (i = 0; i < layout->nr_superblocks; i++) { -+ u64 offset = le64_to_cpu(layout->sb_offset[i]); -+ u64 end = offset + (1 << layout->sb_max_size_bits); -+ -+ if (!(offset >= b_end || end <= b_offset)) -+ return true; -+ } -+ -+ return false; -+} -+ +struct bch_fs *bch2_dev_to_fs(dev_t); +struct bch_fs 
*bch2_uuid_to_fs(__uuid_t); + @@ -87021,10 +89244,10 @@ index 000000000..36bcb9ec2 +#endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h new file mode 100644 -index 000000000..89419fc79 +index 000000000..08faeedba --- /dev/null +++ b/fs/bcachefs/super_types.h -@@ -0,0 +1,51 @@ +@@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_TYPES_H +#define _BCACHEFS_SUPER_TYPES_H @@ -87033,6 +89256,7 @@ index 000000000..89419fc79 + struct bch_sb *sb; + struct block_device *bdev; + struct bio *bio; ++ void *holder; + size_t buffer_size; + fmode_t mode; + unsigned have_layout:1; @@ -87078,10 +89302,10 @@ index 000000000..89419fc79 +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 -index 000000000..740305e67 +index 000000000..941f4bcb9 --- /dev/null +++ b/fs/bcachefs/sysfs.c -@@ -0,0 +1,1064 @@ +@@ -0,0 +1,1059 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache sysfs interfaces @@ -87332,7 +89556,6 @@ index 000000000..740305e67 +read_attribute(io_timers_read); +read_attribute(io_timers_write); + -+read_attribute(data_jobs); +read_attribute(moving_ctxts); + +#ifdef CONFIG_BCACHEFS_TESTS @@ -87542,9 +89765,6 @@ index 000000000..740305e67 + if (attr == &sysfs_io_timers_write) + bch2_io_timers_to_text(out, &c->io_clock[WRITE]); + -+ if (attr == &sysfs_data_jobs) -+ bch2_data_jobs_to_text(out, c); -+ + if (attr == &sysfs_moving_ctxts) + bch2_fs_moving_ctxts_to_text(out, c); + @@ -87765,7 +89985,6 @@ index 000000000..740305e67 + &sysfs_rebalance_work, + sysfs_pd_controller_files(rebalance), + -+ &sysfs_data_jobs, + &sysfs_moving_ctxts, + + &sysfs_internal_uuid, @@ -88202,17 +90421,17 @@ index 000000000..222cd5062 +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c new file mode 100644 -index 000000000..cef23d2cc +index 000000000..72389c737 --- /dev/null +++ b/fs/bcachefs/tests.c -@@ -0,0 +1,939 @@ +@@ -0,0 +1,970 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_TESTS + +#include "bcachefs.h" +#include "btree_update.h" +#include "journal_reclaim.h" -+#include "subvolume.h" ++#include "snapshot.h" +#include "tests.h" + +#include "linux/kthread.h" @@ -88711,6 +90930,36 @@ index 000000000..cef23d2cc + __test_extent_overwrite(c, 32, 64, 32, 128); +} + ++static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid) ++{ ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k_i.k.p.inode = inum; ++ k.k_i.k.p.offset = start + len; ++ k.k_i.k.p.snapshot = snapid; ++ k.k_i.k.size = len; ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_btree_insert_nonextent(&trans, BTREE_ID_extents, &k.k_i, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); ++ if (ret) ++ bch_err_fn(c, ret); ++ return ret; ++} ++ ++static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) ++{ ++ return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */ ++ insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?: ++ insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?: ++ insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */ ++ insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?: ++ insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?: ++ insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX); ++} ++ +/* snapshot unit tests */ + +/* Test skipping over keys in unrelated snapshots: */ @@ -89109,6 +91358,7 
@@ index 000000000..cef23d2cc + perf_test(test_extent_overwrite_back); + perf_test(test_extent_overwrite_middle); + perf_test(test_extent_overwrite_all); ++ perf_test(test_extent_create_overlapping); + + perf_test(test_snapshots); + @@ -89168,7 +91418,7 @@ index 000000000..c73b18aea +#endif /* _BCACHEFS_TEST_H */ diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c new file mode 100644 -index 000000000..d294b3d71 +index 000000000..33efa6005 --- /dev/null +++ b/fs/bcachefs/trace.c @@ -0,0 +1,16 @@ @@ -89182,18 +91432,18 @@ index 000000000..d294b3d71 +#include "btree_update_interior.h" +#include "keylist.h" +#include "opts.h" ++#include "six.h" + +#include -+#include + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h new file mode 100644 -index 000000000..a743ab477 +index 000000000..97fe77423 --- /dev/null +++ b/fs/bcachefs/trace.h -@@ -0,0 +1,1247 @@ +@@ -0,0 +1,1265 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcachefs @@ -89599,29 +91849,43 @@ index 000000000..a743ab477 + __field(u8, level ) + TRACE_BPOS_entries(pos) + __array(char, node, 24 ) ++ __field(u8, self_read_count ) ++ __field(u8, self_intent_count) ++ __field(u8, read_count ) ++ __field(u8, intent_count ) + __field(u32, iter_lock_seq ) + __field(u32, node_lock_seq ) + ), + + TP_fast_assign( + struct btree *b = btree_path_node(path, level); ++ struct six_lock_count c; + + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->level = path->level; + TRACE_BPOS_assign(pos, path->pos); -+ if (IS_ERR(b)) ++ ++ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), ++ __entry->self_read_count = c.n[SIX_LOCK_read]; ++ __entry->self_intent_count = c.n[SIX_LOCK_intent]; ++ ++ if (IS_ERR(b)) { + strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); -+ else ++ } else { ++ c = six_lock_counts(&path->l[level].b->c.lock); ++ __entry->read_count = c.n[SIX_LOCK_read]; ++ __entry->intent_count = c.n[SIX_LOCK_intent]; + scnprintf(__entry->node, sizeof(__entry->node), "%px", b); ++ } + __entry->iter_lock_seq = path->l[level].lock_seq; + __entry->node_lock_seq = is_btree_node(path, level) + ? six_lock_seq(&path->l[level].b->c.lock) + : 0; + ), + -+ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u", ++ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_ids[__entry->btree_id], @@ -89630,6 +91894,10 @@ index 000000000..a743ab477 + __entry->pos_snapshot, + __entry->level, + __entry->node, ++ __entry->self_read_count, ++ __entry->self_intent_count, ++ __entry->read_count, ++ __entry->intent_count, + __entry->iter_lock_seq, + __entry->node_lock_seq) +); @@ -89671,7 +91939,7 @@ index 000000000..a743ab477 + __entry->self_intent_count = c.n[SIX_LOCK_intent]; + c = six_lock_counts(&path->l[level].b->c.lock); + __entry->read_count = c.n[SIX_LOCK_read]; -+ __entry->intent_count = c.n[SIX_LOCK_read]; ++ __entry->intent_count = c.n[SIX_LOCK_intent]; + __entry->iter_lock_seq = path->l[level].lock_seq; + __entry->node_lock_seq = is_btree_node(path, level) + ? 
six_lock_seq(&path->l[level].b->c.lock) @@ -90522,10 +92790,10 @@ index 000000000..905801772 +#endif /* _BCACHEFS_TWO_STATE_LOCK_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 -index 000000000..ae4f6de3c +index 000000000..636f1fa42 --- /dev/null +++ b/fs/bcachefs/util.c -@@ -0,0 +1,1137 @@ +@@ -0,0 +1,1144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * random utiility code, for bcache but in theory not specific to bcache @@ -90744,6 +93012,7 @@ index 000000000..ae4f6de3c + + while ((p = strsep(&s, ","))) { + int flag = match_string(list, -1, p); ++ + if (flag < 0) { + ret = -1; + break; @@ -90796,6 +93065,7 @@ index 000000000..ae4f6de3c + +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) +{ ++#ifdef CONFIG_STACKTRACE + unsigned nr_entries = 0; + int ret = 0; + @@ -90816,6 +93086,9 @@ index 000000000..ae4f6de3c + up_read(&task->signal->exec_update_lock); + + return ret; ++#else ++ return 0; ++#endif +} + +void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) @@ -91284,10 +93557,10 @@ index 000000000..ae4f6de3c + } +} + -+int bch2_bio_alloc_pages_noprof(struct bio *bio, size_t size, gfp_t gfp_mask) ++int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) +{ + while (size) { -+ struct page *page = alloc_pages_noprof(gfp_mask, 0); ++ struct page *page = alloc_pages(gfp_mask, 0); + unsigned len = min_t(size_t, PAGE_SIZE, size); + + if (!page) @@ -91325,9 +93598,10 @@ index 000000000..ae4f6de3c + struct bvec_iter iter; + + __bio_for_each_segment(bv, dst, iter, dst_iter) { -+ void *dstp = kmap_atomic(bv.bv_page); ++ void *dstp = kmap_local_page(bv.bv_page); ++ + memcpy(dstp + bv.bv_offset, src, bv.bv_len); -+ kunmap_atomic(dstp); ++ kunmap_local(dstp); + + src += bv.bv_len; + } @@ -91339,9 +93613,10 @@ index 000000000..ae4f6de3c + struct bvec_iter iter; + + __bio_for_each_segment(bv, src, iter, src_iter) { -+ void *srcp = kmap_atomic(bv.bv_page); ++ void *srcp = kmap_local_page(bv.bv_page); ++ + memcpy(dst, srcp + bv.bv_offset, bv.bv_len); -+ kunmap_atomic(srcp); ++ kunmap_local(srcp); + + dst += bv.bv_len; + } @@ -91665,10 +93940,10 @@ index 000000000..ae4f6de3c +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 -index 000000000..5fa29dab3 +index 000000000..19cc6bfe9 --- /dev/null +++ b/fs/bcachefs/util.h -@@ -0,0 +1,846 @@ +@@ -0,0 +1,851 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_UTIL_H +#define _BCACHEFS_UTIL_H @@ -91731,13 +94006,12 @@ index 000000000..5fa29dab3 + free_pages((unsigned long) p, get_order(size)); +} + -+static inline void *vpmalloc_noprof(size_t size, gfp_t gfp_mask) ++static inline void *vpmalloc(size_t size, gfp_t gfp_mask) +{ -+ return (void *) get_free_pages_noprof(gfp_mask|__GFP_NOWARN, -+ get_order(size)) ?: -+ __vmalloc_noprof(size, gfp_mask); ++ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, ++ get_order(size)) ?: ++ __vmalloc(size, gfp_mask); +} -+#define vpmalloc(_size, _gfp) alloc_hooks(vpmalloc_noprof(_size, _gfp)) + +static inline void kvpfree(void *p, size_t size) +{ @@ -91747,13 +94021,12 @@ index 000000000..5fa29dab3 + vpfree(p, size); +} + -+static inline void *kvpmalloc_noprof(size_t size, gfp_t gfp_mask) ++static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) +{ + return size < PAGE_SIZE -+ ? kmalloc_noprof(size, gfp_mask) -+ : vpmalloc_noprof(size, gfp_mask); ++ ? 
kmalloc(size, gfp_mask) ++ : vpmalloc(size, gfp_mask); +} -+#define kvpmalloc(_size, _gfp) alloc_hooks(kvpmalloc_noprof(_size, _gfp)) + +int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); + @@ -92139,8 +94412,10 @@ index 000000000..5fa29dab3 + s64 last_change; + s64 last_target; + -+ /* If true, the rate will not increase if bch2_ratelimit_delay() -+ * is not being called often enough. */ ++ /* ++ * If true, the rate will not increase if bch2_ratelimit_delay() ++ * is not being called often enough. ++ */ + bool backpressure; +}; + @@ -92203,9 +94478,7 @@ index 000000000..5fa29dab3 +} + +void bch2_bio_map(struct bio *bio, void *base, size_t); -+int bch2_bio_alloc_pages_noprof(struct bio *, size_t, gfp_t); -+#define bch2_bio_alloc_pages(_bio, _size, _gfp) \ -+ alloc_hooks(bch2_bio_alloc_pages_noprof(_bio, _size, _gfp)) ++int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); + +static inline sector_t bdev_sectors(struct block_device *bdev) +{ @@ -92278,6 +94551,7 @@ index 000000000..5fa29dab3 +{ +#ifdef CONFIG_X86_64 + long d0, d1, d2; ++ + asm volatile("rep ; movsq" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (u64s), "1" (dst), "2" (src) @@ -92354,6 +94628,7 @@ index 000000000..5fa29dab3 + +#ifdef CONFIG_X86_64 + long d0, d1, d2; ++ + asm volatile("std ;\n" + "rep ; movsq\n" + "cld ;\n" @@ -92512,15 +94787,20 @@ index 000000000..5fa29dab3 + return cmp_int(l, r); +} + ++static inline int cmp_le32(__le32 l, __le32 r) ++{ ++ return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); ++} ++ +#include + +#endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c new file mode 100644 -index 000000000..ef030fc02 +index 000000000..2a2ab86ed --- /dev/null +++ b/fs/bcachefs/varint.c -@@ -0,0 +1,122 @@ +@@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include @@ -92582,6 +94862,7 @@ index 000000000..ef030fc02 + + if (likely(bytes < 9)) { + __le64 v_le = 0; ++ + memcpy(&v_le, in, bytes); + v = le64_to_cpu(v_le); + v >>= bytes; @@ -92731,10 +95012,10 @@ index 000000000..53a694d71 +#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c new file mode 100644 -index 000000000..70f78006d +index 000000000..6f6b3caf0 --- /dev/null +++ b/fs/bcachefs/xattr.c -@@ -0,0 +1,648 @@ +@@ -0,0 +1,649 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -93231,7 +95512,8 @@ index 000000000..70f78006d + bool defined; +}; + -+static int inode_opt_set_fn(struct bch_inode_info *inode, ++static int inode_opt_set_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ @@ -93473,10 +95755,10 @@ index 52e6d5fda..dbdafa261 100644 } EXPORT_SYMBOL(d_tmpfile); diff --git a/fs/inode.c b/fs/inode.c -index b9d498032..6bb7646cb 100644 +index 67611a360..968931eb4 100644 --- a/fs/inode.c +++ b/fs/inode.c -@@ -57,8 +57,23 @@ +@@ -56,8 +56,23 @@ static unsigned int i_hash_mask __read_mostly; static unsigned int i_hash_shift __read_mostly; @@ -93502,7 +95784,7 @@ index b9d498032..6bb7646cb 100644 /* * Empty aops. 
Can be used for the cases where the user does not -@@ -417,7 +432,7 @@ EXPORT_SYMBOL(address_space_init_once); +@@ -416,7 +431,7 @@ EXPORT_SYMBOL(address_space_init_once); void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); @@ -93511,7 +95793,7 @@ index b9d498032..6bb7646cb 100644 INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); -@@ -506,14 +521,15 @@ static inline void inode_sb_list_del(struct inode *inode) +@@ -505,14 +520,15 @@ static inline void inode_sb_list_del(struct inode *inode) } } @@ -93534,7 +95816,7 @@ index b9d498032..6bb7646cb 100644 } /** -@@ -526,13 +542,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) +@@ -525,13 +541,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { @@ -93552,7 +95834,7 @@ index b9d498032..6bb7646cb 100644 } EXPORT_SYMBOL(__insert_inode_hash); -@@ -544,11 +560,44 @@ EXPORT_SYMBOL(__insert_inode_hash); +@@ -543,11 +559,44 @@ EXPORT_SYMBOL(__insert_inode_hash); */ void __remove_inode_hash(struct inode *inode) { @@ -93602,7 +95884,7 @@ index b9d498032..6bb7646cb 100644 } EXPORT_SYMBOL(__remove_inode_hash); -@@ -897,26 +946,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) +@@ -896,26 +945,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } @@ -93635,7 +95917,7 @@ index b9d498032..6bb7646cb 100644 goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { -@@ -935,19 +986,20 @@ static struct inode *find_inode(struct super_block *sb, +@@ -934,19 +985,20 @@ static struct inode *find_inode(struct super_block *sb, * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, @@ -93659,7 +95941,7 @@ index b9d498032..6bb7646cb 100644 goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { -@@ -1197,25 +1249,25 @@ EXPORT_SYMBOL(unlock_two_nondirectories); +@@ -1196,25 +1248,25 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). 
* @@ -93691,7 +95973,7 @@ index b9d498032..6bb7646cb 100644 if (IS_ERR(old)) return NULL; wait_on_inode(old); -@@ -1237,7 +1289,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, +@@ -1236,7 +1288,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; @@ -93700,7 +95982,7 @@ index b9d498032..6bb7646cb 100644 spin_unlock(&inode->i_lock); /* -@@ -1247,7 +1299,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, +@@ -1246,7 +1298,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); unlock: @@ -93709,7 +95991,7 @@ index b9d498032..6bb7646cb 100644 return inode; } -@@ -1308,12 +1360,12 @@ EXPORT_SYMBOL(iget5_locked); +@@ -1307,12 +1359,12 @@ EXPORT_SYMBOL(iget5_locked); */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { @@ -93726,7 +96008,7 @@ index b9d498032..6bb7646cb 100644 if (inode) { if (IS_ERR(inode)) return NULL; -@@ -1329,17 +1381,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) +@@ -1328,17 +1380,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) if (inode) { struct inode *old; @@ -93748,7 +96030,7 @@ index b9d498032..6bb7646cb 100644 /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents -@@ -1352,7 +1404,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) +@@ -1351,7 +1403,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) * us. Use the old inode instead of the one we just * allocated. */ @@ -93757,7 +96039,7 @@ index b9d498032..6bb7646cb 100644 destroy_inode(inode); if (IS_ERR(old)) return NULL; -@@ -1376,10 +1428,11 @@ EXPORT_SYMBOL(iget_locked); +@@ -1375,10 +1427,11 @@ EXPORT_SYMBOL(iget_locked); */ static int test_inode_iunique(struct super_block *sb, unsigned long ino) { @@ -93771,7 +96053,7 @@ index b9d498032..6bb7646cb 100644 if (inode->i_ino == ino && inode->i_sb == sb) return 0; } -@@ -1463,12 +1516,12 @@ EXPORT_SYMBOL(igrab); +@@ -1462,12 +1515,12 @@ EXPORT_SYMBOL(igrab); struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { @@ -93788,7 +96070,7 @@ index b9d498032..6bb7646cb 100644 return IS_ERR(inode) ? 
NULL : inode; } -@@ -1518,12 +1571,12 @@ EXPORT_SYMBOL(ilookup5); +@@ -1517,12 +1570,12 @@ EXPORT_SYMBOL(ilookup5); */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { @@ -93805,7 +96087,7 @@ index b9d498032..6bb7646cb 100644 if (inode) { if (IS_ERR(inode)) -@@ -1567,12 +1620,13 @@ struct inode *find_inode_nowait(struct super_block *sb, +@@ -1566,12 +1619,13 @@ struct inode *find_inode_nowait(struct super_block *sb, void *), void *data) { @@ -93822,7 +96104,7 @@ index b9d498032..6bb7646cb 100644 if (inode->i_sb != sb) continue; mval = match(inode, hashval, data); -@@ -1583,7 +1637,7 @@ struct inode *find_inode_nowait(struct super_block *sb, +@@ -1582,7 +1636,7 @@ struct inode *find_inode_nowait(struct super_block *sb, goto out; } out: @@ -93831,7 +96113,7 @@ index b9d498032..6bb7646cb 100644 return ret_inode; } EXPORT_SYMBOL(find_inode_nowait); -@@ -1612,13 +1666,14 @@ EXPORT_SYMBOL(find_inode_nowait); +@@ -1611,13 +1665,14 @@ EXPORT_SYMBOL(find_inode_nowait); struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { @@ -93848,7 +96130,7 @@ index b9d498032..6bb7646cb 100644 if (inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) -@@ -1650,13 +1705,14 @@ EXPORT_SYMBOL(find_inode_rcu); +@@ -1649,13 +1704,14 @@ EXPORT_SYMBOL(find_inode_rcu); struct inode *find_inode_by_ino_rcu(struct super_block *sb, unsigned long ino) { @@ -93865,7 +96147,7 @@ index b9d498032..6bb7646cb 100644 if (inode->i_ino == ino && inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) -@@ -1670,39 +1726,42 @@ int insert_inode_locked(struct inode *inode) +@@ -1669,39 +1725,42 @@ int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; @@ -93921,7 +96203,7 @@ index b9d498032..6bb7646cb 100644 wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); -@@ -2227,17 +2286,18 @@ EXPORT_SYMBOL(inode_needs_sync); +@@ -2226,17 +2285,18 @@ EXPORT_SYMBOL(inode_needs_sync); * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. 
*/ @@ -93943,7 +96225,7 @@ index b9d498032..6bb7646cb 100644 } static __initdata unsigned long ihash_entries; -@@ -2263,7 +2323,7 @@ void __init inode_init_early(void) +@@ -2262,7 +2322,7 @@ void __init inode_init_early(void) inode_hashtable = alloc_large_system_hash("Inode-cache", @@ -93952,7 +96234,7 @@ index b9d498032..6bb7646cb 100644 ihash_entries, 14, HASH_EARLY | HASH_ZERO, -@@ -2289,7 +2349,7 @@ void __init inode_init(void) +@@ -2288,7 +2348,7 @@ void __init inode_init(void) inode_hashtable = alloc_large_system_hash("Inode-cache", @@ -93962,7 +96244,7 @@ index b9d498032..6bb7646cb 100644 14, HASH_ZERO, diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c -index 063133ec7..13c40c09d 100644 +index aa8967cca..72d32603f 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -292,8 +292,12 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, @@ -94015,7 +96297,7 @@ index 063133ec7..13c40c09d 100644 bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); - bio_add_folio(&bio, folio, plen, poff); + bio_add_folio_nofail(&bio, folio, plen, poff); - return submit_bio_wait(&bio); + + if (iomap->flags & IOMAP_F_NOSUBMIT) @@ -94026,7 +96308,7 @@ index 063133ec7..13c40c09d 100644 } static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, -@@ -1486,7 +1503,10 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, +@@ -1489,7 +1506,10 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, return error; } @@ -94038,7 +96320,7 @@ index 063133ec7..13c40c09d 100644 return 0; } -@@ -1524,8 +1544,9 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, +@@ -1527,8 +1547,9 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, * traversal in iomap_finish_ioend(). */ static struct bio * @@ -94049,7 +96331,7 @@ index 063133ec7..13c40c09d 100644 struct bio *new; new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); -@@ -1534,7 +1555,11 @@ iomap_chain_bio(struct bio *prev) +@@ -1537,7 +1558,11 @@ iomap_chain_bio(struct bio *prev) bio_chain(prev, new); bio_get(prev); /* for iomap_finish_ioend */ @@ -94062,80 +96344,15 @@ index 063133ec7..13c40c09d 100644 return new; } -@@ -1581,7 +1606,7 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, +@@ -1584,7 +1609,7 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, } if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { - wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); + wpc->ioend->io_bio = iomap_chain_bio(wpc); - bio_add_folio(wpc->ioend->io_bio, folio, len, poff); + bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); } -diff --git a/fs/super.c b/fs/super.c -index 04bc62ab7..a2decce02 100644 ---- a/fs/super.c -+++ b/fs/super.c -@@ -791,14 +791,7 @@ void iterate_supers_type(struct file_system_type *type, - - EXPORT_SYMBOL(iterate_supers_type); - --/** -- * get_super - get the superblock of a device -- * @bdev: device to get the superblock for -- * -- * Scans the superblock list and finds the superblock of the file system -- * mounted on the device given. %NULL is returned if no match is found. 
-- */ --struct super_block *get_super(struct block_device *bdev) -+static struct super_block *__get_super(struct block_device *bdev, bool try) - { - struct super_block *sb; - -@@ -813,7 +806,12 @@ struct super_block *get_super(struct block_device *bdev) - if (sb->s_bdev == bdev) { - sb->s_count++; - spin_unlock(&sb_lock); -- down_read(&sb->s_umount); -+ -+ if (!try) -+ down_read(&sb->s_umount); -+ else if (!down_read_trylock(&sb->s_umount)) -+ return NULL; -+ - /* still alive? */ - if (sb->s_root && (sb->s_flags & SB_BORN)) - return sb; -@@ -828,6 +826,30 @@ struct super_block *get_super(struct block_device *bdev) - return NULL; - } - -+/** -+ * get_super - get the superblock of a device -+ * @bdev: device to get the superblock for -+ * -+ * Scans the superblock list and finds the superblock of the file system -+ * mounted on the device given. %NULL is returned if no match is found. -+ */ -+struct super_block *get_super(struct block_device *bdev) -+{ -+ return __get_super(bdev, false); -+} -+ -+/** -+ * try_get_super - get the superblock of a device, using trylock on sb->s_umount -+ * @bdev: device to get the superblock for -+ * -+ * Scans the superblock list and finds the superblock of the file system -+ * mounted on the device given. %NULL is returned if no match is found. -+ */ -+struct super_block *try_get_super(struct block_device *bdev) -+{ -+ return __get_super(bdev, true); -+} -+ - /** - * get_active_super - get an active reference to the superblock of a device - * @bdev: device to get the superblock for diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 18c8f168b..f0003446f 100644 --- a/fs/xfs/xfs_iomap.c @@ -94151,7 +96368,7 @@ index 18c8f168b..f0003446f 100644 return xfs_alert_fsblock_zero(ip, imap); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h -index 6c09f8953..2733c5484 100644 +index e2866e7fa..29ecb5643 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -284,6 +284,7 @@ typedef struct xfs_mount { @@ -94171,7 +96388,7 @@ index 6c09f8953..2733c5484 100644 __XFS_HAS_FEAT(noalign, NOALIGN) __XFS_HAS_FEAT(allocsize, ALLOCSIZE) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c -index 4120bd1cb..83a0a043b 100644 +index 818510243..b6cdce43c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -121,7 +121,7 @@ enum { @@ -94191,7 +96408,7 @@ index 4120bd1cb..83a0a043b 100644 {} }; -@@ -1376,6 +1377,9 @@ xfs_fs_parse_param( +@@ -1396,6 +1397,9 @@ xfs_fs_parse_param( xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); parsing_mp->m_features |= XFS_FEAT_NOATTR2; return 0; @@ -94201,219 +96418,11 @@ index 4120bd1cb..83a0a043b 100644 default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; -diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h -new file mode 100644 -index 000000000..16fbf74ed ---- /dev/null -+++ b/include/asm-generic/codetag.lds.h -@@ -0,0 +1,15 @@ -+/* SPDX-License-Identifier: GPL-2.0-only */ -+#ifndef __ASM_GENERIC_CODETAG_LDS_H -+#define __ASM_GENERIC_CODETAG_LDS_H -+ -+#define SECTION_WITH_BOUNDARIES(_name) \ -+ . 
= ALIGN(8); \ -+ __start_##_name = .; \ -+ KEEP(*(_name)) \ -+ __stop_##_name = .; -+ -+#define CODETAG_SECTIONS() \ -+ SECTION_WITH_BOUNDARIES(alloc_tags) \ -+ SECTION_WITH_BOUNDARIES(dynamic_fault_tags) -+ -+#endif /* __ASM_GENERIC_CODETAG_LDS_H */ -diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h -index da9e5629e..47dd57ca7 100644 ---- a/include/asm-generic/vmlinux.lds.h -+++ b/include/asm-generic/vmlinux.lds.h -@@ -50,6 +50,8 @@ - * [__nosave_begin, __nosave_end] for the nosave data - */ - -+#include -+ - #ifndef LOAD_OFFSET - #define LOAD_OFFSET 0 - #endif -@@ -374,6 +376,7 @@ - . = ALIGN(8); \ - BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes) \ - BOUNDED_SECTION_BY(__dyndbg, ___dyndbg) \ -+ CODETAG_SECTIONS() \ - LIKELY_PROFILE() \ - BRANCH_PROFILE() \ - TRACE_PRINTKS() \ -diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h -new file mode 100644 -index 000000000..6c1b7e1dc ---- /dev/null -+++ b/include/linux/alloc_tag.h -@@ -0,0 +1,160 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * allocation tagging -+ */ -+#ifndef _LINUX_ALLOC_TAG_H -+#define _LINUX_ALLOC_TAG_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * An instance of this structure is created in a special ELF section at every -+ * allocation callsite. At runtime, the special section is treated as -+ * an array of these. Embedded codetag utilizes codetag framework. -+ */ -+struct alloc_tag { -+ struct codetag ct; -+ u64 __percpu *bytes_allocated; -+} __aligned(8); -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ -+void alloc_tags_show_mem_report(struct seq_buf *s); -+ -+static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) -+{ -+ return container_of(ct, struct alloc_tag, ct); -+} -+ -+#define DEFINE_ALLOC_TAG(_alloc_tag, _old) \ -+ static struct alloc_tag _alloc_tag __used __aligned(8) \ -+ __section("alloc_tags") = { .ct = CODE_TAG_INIT }; \ -+ struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag) -+ -+DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, -+ mem_alloc_profiling_key); -+ -+static inline bool mem_alloc_profiling_enabled(void) -+{ -+ return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, -+ &mem_alloc_profiling_key); -+} -+ -+static inline u64 alloc_tag_read(struct alloc_tag *tag) -+{ -+ u64 v = 0; -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ v += *per_cpu_ptr(tag->bytes_allocated, cpu); -+ -+ return v; -+} -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+ -+#define CODETAG_EMPTY (void *)1 -+ -+static inline bool is_codetag_empty(union codetag_ref *ref) -+{ -+ return ref->ct == CODETAG_EMPTY; -+} -+ -+static inline void set_codetag_empty(union codetag_ref *ref) -+{ -+ if (ref) -+ ref->ct = CODETAG_EMPTY; -+} -+ -+#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -+ -+static inline bool is_codetag_empty(union codetag_ref *ref) { return false; } -+static inline void set_codetag_empty(union codetag_ref *ref) {} -+ -+#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -+ -+static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes) -+{ -+ struct alloc_tag *tag; -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+ WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); -+#endif -+ if (!ref || !ref->ct) -+ return; -+ -+ if (is_codetag_empty(ref)) { -+ ref->ct = NULL; -+ return; -+ } -+ -+ tag = ct_to_alloc_tag(ref->ct); -+ -+ this_cpu_add(*tag->bytes_allocated, -bytes); -+ ref->ct = NULL; -+} -+ -+static inline void alloc_tag_sub(union codetag_ref *ref, 
size_t bytes) -+{ -+ __alloc_tag_sub(ref, bytes); -+} -+ -+static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) -+{ -+ __alloc_tag_sub(ref, bytes); -+} -+ -+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) -+{ -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+ WARN_ONCE(ref && ref->ct, -+ "alloc_tag was not cleared (got tag for %s:%u)\n",\ -+ ref->ct->filename, ref->ct->lineno); -+ -+ WARN_ONCE(!tag, "current->alloc_tag not set"); -+#endif -+ if (!ref || !tag) -+ return; -+ -+ ref->ct = &tag->ct; -+ this_cpu_add(*tag->bytes_allocated, bytes); -+} -+ -+#else -+ -+#define DEFINE_ALLOC_TAG(_alloc_tag, _old) -+static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} -+static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {} -+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, -+ size_t bytes) {} -+static inline void set_codetag_empty(union codetag_ref *ref) {} -+ -+#endif -+ -+typedef struct mempool_s mempool_t; -+ -+#define res_type_to_err(_res) _Generic((_res), \ -+ struct folio *: NULL, \ -+ struct page *: NULL, \ -+ mempool_t *: NULL, \ -+ void *: NULL, \ -+ unsigned long: 0, \ -+ int: -ENOMEM) -+ -+#define alloc_hooks(_do_alloc) \ -+({ \ -+ typeof(_do_alloc) _res; \ -+ DEFINE_ALLOC_TAG(_alloc_tag, _old); \ -+ \ -+ _res = !memory_fault() ? _do_alloc : res_type_to_err(_res); \ -+ alloc_tag_restore(&_alloc_tag, _old); \ -+ _res; \ -+}) -+ -+#endif /* _LINUX_ALLOC_TAG_H */ diff --git a/include/linux/bio.h b/include/linux/bio.h -index b3e7529ff..f2620f8d1 100644 +index 11984ed29..debbd8fcb 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h -@@ -484,7 +484,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, +@@ -488,7 +488,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); @@ -94428,10 +96437,10 @@ index b3e7529ff..f2620f8d1 100644 static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 67e942d77..10d30c0bc 100644 +index 87d94be78..61ffaaba4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h -@@ -855,6 +855,7 @@ extern const char *blk_op_str(enum req_op op); +@@ -846,6 +846,7 @@ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); @@ -94567,122 +96576,6 @@ index c88cdc4ae..722a586bb 100644 +} while (0) + #endif /* _LINUX_CLOSURE_H */ -diff --git a/include/linux/codetag.h b/include/linux/codetag.h -new file mode 100644 -index 000000000..87207f199 ---- /dev/null -+++ b/include/linux/codetag.h -@@ -0,0 +1,110 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * code tagging framework -+ */ -+#ifndef _LINUX_CODETAG_H -+#define _LINUX_CODETAG_H -+ -+#include -+ -+struct codetag_iterator; -+struct codetag_type; -+struct seq_buf; -+struct module; -+ -+/* -+ * An instance of this structure is created in a special ELF section at every -+ * code location being tagged. At runtime, the special section is treated as -+ * an array of these. 
-+ */ -+struct codetag { -+ unsigned int flags; /* used in later patches */ -+ unsigned int lineno; -+ const char *modname; -+ const char *function; -+ const char *filename; -+} __aligned(8); -+ -+union codetag_ref { -+ struct codetag *ct; -+}; -+ -+struct codetag_range { -+ struct codetag *start; -+ struct codetag *stop; -+}; -+ -+struct codetag_module { -+ struct module *mod; -+ struct codetag_range range; -+}; -+ -+struct codetag_type_desc { -+ const char *section; -+ size_t tag_size; -+ void (*module_load)(struct codetag_type *cttype, -+ struct codetag_module *cmod); -+ bool (*module_unload)(struct codetag_type *cttype, -+ struct codetag_module *cmod); -+}; -+ -+struct codetag_iterator { -+ struct codetag_type *cttype; -+ struct codetag_module *cmod; -+ unsigned long mod_id; -+ struct codetag *ct; -+}; -+ -+#define CODE_TAG_INIT { \ -+ .modname = KBUILD_MODNAME, \ -+ .function = __func__, \ -+ .filename = __FILE__, \ -+ .lineno = __LINE__, \ -+ .flags = 0, \ -+} -+ -+void codetag_lock_module_list(struct codetag_type *cttype, bool lock); -+struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype); -+struct codetag *codetag_next_ct(struct codetag_iterator *iter); -+ -+void codetag_to_text(struct seq_buf *out, struct codetag *ct); -+ -+struct codetag_type * -+codetag_register_type(const struct codetag_type_desc *desc); -+ -+#ifdef CONFIG_CODE_TAGGING -+void codetag_load_module(struct module *mod); -+bool codetag_unload_module(struct module *mod); -+#else -+static inline void codetag_load_module(struct module *mod) {} -+static inline bool codetag_unload_module(struct module *mod) { return true; } -+#endif -+ -+/* Codetag query parsing */ -+ -+struct codetag_query { -+ const char *filename; -+ const char *module; -+ const char *function; -+ const char *class; -+ unsigned int first_line, last_line; -+ unsigned int first_index, last_index; -+ unsigned int cur_index; -+ -+ bool match_line:1; -+ bool match_index:1; -+ -+ unsigned int set_enabled:1; -+ unsigned int enabled:2; -+ -+ unsigned int set_frequency:1; -+ unsigned int frequency; -+}; -+ -+char *codetag_query_parse(struct codetag_query *q, char *buf); -+bool codetag_matches_query(struct codetag_query *q, -+ const struct codetag *ct, -+ const struct codetag_module *mod, -+ const char *class); -+ -+#endif /* _LINUX_CODETAG_H */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6b351e009..3da2f0545 100644 --- a/include/linux/dcache.h @@ -94695,106 +96588,8 @@ index 6b351e009..3da2f0545 100644 extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); -diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h -index 31f114f48..d741940dc 100644 ---- a/include/linux/dma-map-ops.h -+++ b/include/linux/dma-map-ops.h -@@ -27,7 +27,7 @@ struct dma_map_ops { - unsigned long attrs); - void (*free)(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs); -- struct page *(*alloc_pages)(struct device *dev, size_t size, -+ struct page *(*alloc_pages_op)(struct device *dev, size_t size, - dma_addr_t *dma_handle, enum dma_data_direction dir, - gfp_t gfp); - void (*free_pages)(struct device *dev, size_t size, struct page *vaddr, -diff --git a/include/linux/dynamic_fault.h b/include/linux/dynamic_fault.h -new file mode 100644 -index 000000000..526a33209 ---- /dev/null -+++ b/include/linux/dynamic_fault.h -@@ -0,0 +1,79 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+ -+#ifndef _LINUX_DYNAMIC_FAULT_H -+#define _LINUX_DYNAMIC_FAULT_H 
-+ -+/* -+ * Dynamic/code tagging fault injection: -+ * -+ * Originally based on the dynamic debug trick of putting types in a special elf -+ * section, then rewritten using code tagging: -+ * -+ * To use, simply insert a call to dynamic_fault("fault_class"), which will -+ * return true if an error should be injected. -+ * -+ * Fault injection sites may be listed and enabled via debugfs, under -+ * /sys/kernel/debug/dynamic_faults. -+ */ -+ -+#ifdef CONFIG_CODETAG_FAULT_INJECTION -+ -+#include -+#include -+ -+#define DFAULT_STATES() \ -+ x(disabled) \ -+ x(enabled) \ -+ x(oneshot) -+ -+enum dfault_enabled { -+#define x(n) DFAULT_##n, -+ DFAULT_STATES() -+#undef x -+}; -+ -+union dfault_state { -+ struct { -+ unsigned int enabled:2; -+ unsigned int count:30; -+ }; -+ -+ struct { -+ unsigned int v; -+ }; -+}; -+ -+struct dfault { -+ struct codetag tag; -+ const char *class; -+ unsigned int frequency; -+ union dfault_state state; -+ struct static_key_false enabled; -+}; -+ -+bool __dynamic_fault_enabled(struct dfault *df); -+ -+#define dynamic_fault(_class) \ -+({ \ -+ static struct dfault \ -+ __used \ -+ __section("dynamic_fault_tags") \ -+ __aligned(8) df = { \ -+ .tag = CODE_TAG_INIT, \ -+ .class = _class, \ -+ .enabled = STATIC_KEY_FALSE_INIT, \ -+ }; \ -+ \ -+ static_key_false(&df.enabled.key) && \ -+ __dynamic_fault_enabled(&df); \ -+}) -+ -+#else -+ -+#define dynamic_fault(_class) false -+ -+#endif /* CODETAG_FAULT_INJECTION */ -+ -+#define memory_fault() dynamic_fault("memory") -+ -+#endif /* _LINUX_DYNAMIC_FAULT_H */ diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h -index 9edb29101..4bf7c8466 100644 +index 11fbd0ee1..f49a7d311 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -98,6 +98,12 @@ enum fid_type { @@ -94810,35 +96605,11 @@ index 9edb29101..4bf7c8466 100644 /* * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) -diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h -index c9de1f59e..6f36fff09 100644 ---- a/include/linux/fortify-string.h -+++ b/include/linux/fortify-string.h -@@ -689,9 +689,9 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) - return __real_memchr_inv(p, c, size); - } - --extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup) -+extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup_noprof) - __realloc_size(2); --__FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp) -+__FORTIFY_INLINE void *kmemdup_noprof(const void * const POS0 p, size_t size, gfp_t gfp) - { - size_t p_size = __struct_size(p); - -@@ -701,6 +701,7 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp - fortify_panic(__func__); - return __real_kmemdup(p, size, gfp); - } -+#define kmemdup(...) 
alloc_hooks(kmemdup_noprof(__VA_ARGS__)) - - /** - * strcpy - Copy a string into another string buffer diff --git a/include/linux/fs.h b/include/linux/fs.h -index 133f0640f..f04872975 100644 +index 562f2623c..810fa0812 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h -@@ -664,7 +664,8 @@ struct inode { +@@ -660,7 +660,8 @@ struct inode { unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; @@ -94848,7 +96619,7 @@ index 133f0640f..f04872975 100644 struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ -@@ -730,7 +731,7 @@ static inline unsigned int i_blocksize(const struct inode *node) +@@ -726,7 +727,7 @@ static inline unsigned int i_blocksize(const struct inode *node) static inline int inode_unhashed(struct inode *inode) { @@ -94857,7 +96628,7 @@ index 133f0640f..f04872975 100644 } /* -@@ -741,7 +742,7 @@ static inline int inode_unhashed(struct inode *inode) +@@ -737,7 +738,7 @@ static inline int inode_unhashed(struct inode *inode) */ static inline void inode_fake_hash(struct inode *inode) { @@ -94866,7 +96637,7 @@ index 133f0640f..f04872975 100644 } /* -@@ -2699,11 +2700,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, +@@ -2729,11 +2730,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, * This must be used for allocating filesystems specific inodes to set * up the inode reclaim context correctly. */ @@ -94879,7 +96650,7 @@ index 133f0640f..f04872975 100644 extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) -@@ -2714,7 +2711,7 @@ static inline void insert_inode_hash(struct inode *inode) +@@ -2744,7 +2741,7 @@ static inline void insert_inode_hash(struct inode *inode) extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { @@ -94888,14 +96659,6 @@ index 133f0640f..f04872975 100644 __remove_inode_hash(inode); } -@@ -2897,6 +2894,7 @@ extern struct file_system_type *get_filesystem(struct file_system_type *fs); - extern void put_filesystem(struct file_system_type *fs); - extern struct file_system_type *get_fs_type(const char *name); - extern struct super_block *get_super(struct block_device *); -+extern struct super_block *try_get_super(struct block_device *); - extern struct super_block *get_active_super(struct block_device *bdev); - extern void drop_super(struct super_block *sb); - extern void drop_super_exclusive(struct super_block *sb); diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 107613f7d..c74b73769 100644 --- a/include/linux/generic-radix-tree.h @@ -95007,220 +96770,11 @@ index 107613f7d..c74b73769 100644 int __genradix_prealloc(struct __genradix *, size_t, gfp_t); /** -diff --git a/include/linux/gfp.h b/include/linux/gfp.h -index ed8cb537c..495745c99 100644 ---- a/include/linux/gfp.h -+++ b/include/linux/gfp.h -@@ -6,6 +6,8 @@ - - #include - #include -+#include -+#include - - struct vm_area_struct; - -@@ -174,42 +176,43 @@ static inline void arch_free_page(struct page *page, int order) { } - static inline void arch_alloc_page(struct page *page, int order) { } - #endif - --struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, -+struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid, - nodemask_t *nodemask); --struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, -+#define 
__alloc_pages(...) alloc_hooks(__alloc_pages_noprof(__VA_ARGS__)) -+ -+struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, - nodemask_t *nodemask); -+#define __folio_alloc(...) alloc_hooks(__folio_alloc_noprof(__VA_ARGS__)) - --unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, -+unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, - nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, - struct page **page_array); -+#define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) - --unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, -+unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, - unsigned long nr_pages, - struct page **page_array); -+#define alloc_pages_bulk_array_mempolicy(...) alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__)) - - /* Bulk allocate order-0 pages */ --static inline unsigned long --alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list) --{ -- return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL); --} -+#define alloc_pages_bulk_list(_gfp, _nr_pages, _list) \ -+ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL) - --static inline unsigned long --alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array) --{ -- return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array); --} -+#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \ -+ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array) - - static inline unsigned long --alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) -+alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) - { - if (nid == NUMA_NO_NODE) - nid = numa_mem_id(); - -- return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array); -+ return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, NULL, page_array); - } - -+#define alloc_pages_bulk_array_node(...) alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__)) -+ - static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) - { - gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN); -@@ -229,21 +232,23 @@ static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) - * online. For more general interface, see alloc_pages_node(). - */ - static inline struct page * --__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) -+__alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) - { - VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); - warn_if_node_offline(nid, gfp_mask); - -- return __alloc_pages(gfp_mask, order, nid, NULL); -+ return __alloc_pages_noprof(gfp_mask, order, nid, NULL); - } - -+#define __alloc_pages_node(...) alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__)) -+ - static inline - struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) - { - VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); - warn_if_node_offline(nid, gfp); - -- return __folio_alloc(gfp, order, nid, NULL); -+ return __folio_alloc_noprof(gfp, order, nid, NULL); - } - - /* -@@ -251,53 +256,69 @@ struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) - * prefer the current CPU's closest node. Otherwise node must be valid and - * online. 
- */ --static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, -- unsigned int order) -+static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, -+ unsigned int order) - { - if (nid == NUMA_NO_NODE) - nid = numa_mem_id(); - -- return __alloc_pages_node(nid, gfp_mask, order); -+ return __alloc_pages_node_noprof(nid, gfp_mask, order); - } - -+#define alloc_pages_node(...) alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__)) -+ - #ifdef CONFIG_NUMA --struct page *alloc_pages(gfp_t gfp, unsigned int order); --struct folio *folio_alloc(gfp_t gfp, unsigned order); --struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, -+struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); -+struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); -+struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage); - #else --static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) -+static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) - { -- return alloc_pages_node(numa_node_id(), gfp_mask, order); -+ return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); - } --static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) -+static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) - { - return __folio_alloc_node(gfp, order, numa_node_id()); - } --#define vma_alloc_folio(gfp, order, vma, addr, hugepage) \ -- folio_alloc(gfp, order) -+#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \ -+ folio_alloc_noprof(gfp, order) - #endif -+ -+#define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) -+#define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) -+#define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) -+ - #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) --static inline struct page *alloc_page_vma(gfp_t gfp, -+ -+static inline struct page *alloc_page_vma_noprof(gfp_t gfp, - struct vm_area_struct *vma, unsigned long addr) - { -- struct folio *folio = vma_alloc_folio(gfp, 0, vma, addr, false); -+ struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr, false); - - return &folio->page; - } -+#define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) -+ -+extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); -+#define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) - --extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); --extern unsigned long get_zeroed_page(gfp_t gfp_mask); -+extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask); -+#define get_zeroed_page(...) alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__)) -+ -+void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1); -+#define alloc_pages_exact(...) alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__)) - --void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); - void free_pages_exact(void *virt, size_t size); --__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); - --#define __get_free_page(gfp_mask) \ -- __get_free_pages((gfp_mask), 0) -+__meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); -+#define alloc_pages_exact_nid(...) 
alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__)) -+ -+#define __get_free_page(gfp_mask) \ -+ __get_free_pages((gfp_mask), 0) - --#define __get_dma_pages(gfp_mask, order) \ -- __get_free_pages((gfp_mask) | GFP_DMA, (order)) -+#define __get_dma_pages(gfp_mask, order) \ -+ __get_free_pages((gfp_mask) | GFP_DMA, (order)) - - extern void __free_pages(struct page *page, unsigned int order); - extern void free_pages(unsigned long addr, unsigned int order); -@@ -354,10 +375,14 @@ static inline bool pm_suspended_storage(void) - - #ifdef CONFIG_CONTIG_ALLOC - /* The below functions must be run on a range from a single zone. */ --extern int alloc_contig_range(unsigned long start, unsigned long end, -+extern int alloc_contig_range_noprof(unsigned long start, unsigned long end, - unsigned migratetype, gfp_t gfp_mask); --extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, -- int nid, nodemask_t *nodemask); -+#define alloc_contig_range(...) alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) -+ -+extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, -+ int nid, nodemask_t *nodemask); -+#define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) -+ - #endif - void free_contig_range(unsigned long pfn, unsigned long nr_pages); - diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h -index 6583a5867..1c6573d69 100644 +index 6583a5867..3fbe62476 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h -@@ -21,44 +21,86 @@ typedef unsigned int __bitwise gfp_t; +@@ -21,44 +21,78 @@ typedef unsigned int __bitwise gfp_t; * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c */ @@ -95256,9 +96810,6 @@ index 6583a5867..1c6573d69 100644 +#ifdef CONFIG_LOCKDEP + ___GFP_NOLOCKDEP_BIT, +#endif -+#ifdef CONFIG_SLAB_OBJ_EXT -+ ___GFP_NO_OBJ_EXT_BIT, -+#endif + ___GFP_LAST_BIT +}; + @@ -95326,31 +96877,10 @@ index 6583a5867..1c6573d69 100644 #define ___GFP_NOLOCKDEP 0 #endif -/* If the above are modified, __GFP_BITS_SHIFT may need updating */ -+#ifdef CONFIG_SLAB_OBJ_EXT -+#define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT) -+#else -+#define ___GFP_NO_OBJ_EXT 0 -+#endif /* * Physical address zone modifiers (see linux/mmzone.h - low four bits) -@@ -99,12 +141,15 @@ typedef unsigned int __bitwise gfp_t; - * node with no fallbacks or placement policy enforcements. - * - * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg. -+ * -+ * %__GFP_NO_OBJ_EXT causes slab allocation to have no object extension. 
- */ - #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) - #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) - #define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) - #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE) - #define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT) -+#define __GFP_NO_OBJ_EXT ((__force gfp_t)___GFP_NO_OBJ_EXT) - - /** - * DOC: Watermark modifiers -@@ -249,7 +294,7 @@ typedef unsigned int __bitwise gfp_t; +@@ -249,7 +283,7 @@ typedef unsigned int __bitwise gfp_t; #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ @@ -95359,19 +96889,6 @@ index 6583a5867..1c6573d69 100644 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** -diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h -index 0ee140176..e67349e84 100644 ---- a/include/linux/hrtimer.h -+++ b/include/linux/hrtimer.h -@@ -16,7 +16,7 @@ - #include - #include - #include --#include -+#include - #include - #include - #include diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e2b836c2e..a774d074b 100644 --- a/include/linux/iomap.h @@ -95418,7 +96935,7 @@ index ae1b54144..8ee2bf5af 100644 { bit_spin_lock(0, (unsigned long *)b); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h -index 74bd269a8..3bb30499d 100644 +index 310f85903..2fdfd9129 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -344,6 +344,8 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); @@ -95439,7 +96956,7 @@ index 74bd269a8..3bb30499d 100644 #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map __maybe_unused _name = {} -@@ -681,4 +685,10 @@ lockdep_rcu_suspicious(const char *file, const int line, const char *s) +@@ -689,4 +693,10 @@ lockdep_rcu_suspicious(const char *file, const int line, const char *s) } #endif @@ -95451,10 +96968,10 @@ index 74bd269a8..3bb30499d 100644 + #endif /* __LINUX_LOCKDEP_H */ diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h -index 59f4fb162..f90c779e4 100644 +index 2ebc323d3..aa6bddac2 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h -@@ -129,7 +129,7 @@ struct lock_class { +@@ -137,7 +137,7 @@ struct lock_class { u8 wait_type_inner; u8 wait_type_outer; u8 lock_type; @@ -95667,306 +97184,8 @@ index 000000000..647505010 +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); + +#endif // MEAN_AND_VAIRANCE_H_ -diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h -index 222d73701..3eb8975c1 100644 ---- a/include/linux/memcontrol.h -+++ b/include/linux/memcontrol.h -@@ -339,15 +339,32 @@ struct mem_cgroup { - extern struct mem_cgroup *root_mem_cgroup; - - enum page_memcg_data_flags { -- /* page->memcg_data is a pointer to an objcgs vector */ -- MEMCG_DATA_OBJCGS = (1UL << 0), -+ /* page->memcg_data is a pointer to an slabobj_ext vector */ -+ MEMCG_DATA_OBJEXTS = (1UL << 0), - /* page has been accounted as a non-slab kernel page */ - MEMCG_DATA_KMEM = (1UL << 1), - /* the next bit after the last actual flag */ - __NR_MEMCG_DATA_FLAGS = (1UL << 2), - }; - --#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) -+#define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS -+ -+#else /* CONFIG_MEMCG */ -+ -+#define __FIRST_OBJEXT_FLAG (1UL << 0) -+ -+#endif /* CONFIG_MEMCG */ -+ -+enum objext_flags { -+ /* slabobj_ext vector failed to allocate */ -+ OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG, -+ /* the next bit after the last actual flag */ -+ __NR_OBJEXTS_FLAGS = 
(__FIRST_OBJEXT_FLAG << 1), -+}; -+ -+#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1) -+ -+#ifdef CONFIG_MEMCG - - static inline bool folio_memcg_kmem(struct folio *folio); - -@@ -378,10 +395,10 @@ static inline struct mem_cgroup *__folio_memcg(struct folio *folio) - unsigned long memcg_data = folio->memcg_data; - - VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); -- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); -+ VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); - -- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+ return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - } - - /* -@@ -399,10 +416,10 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) - unsigned long memcg_data = folio->memcg_data; - - VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); -- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); -+ VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); - VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); - -- return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+ return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - } - - /* -@@ -459,11 +476,11 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) - if (memcg_data & MEMCG_DATA_KMEM) { - struct obj_cgroup *objcg; - -- objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+ objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - return obj_cgroup_memcg(objcg); - } - -- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+ return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - } - - /* -@@ -496,17 +513,17 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) - */ - unsigned long memcg_data = READ_ONCE(folio->memcg_data); - -- if (memcg_data & MEMCG_DATA_OBJCGS) -+ if (memcg_data & MEMCG_DATA_OBJEXTS) - return NULL; - - if (memcg_data & MEMCG_DATA_KMEM) { - struct obj_cgroup *objcg; - -- objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+ objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - return obj_cgroup_memcg(objcg); - } - -- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+ return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - } - - static inline struct mem_cgroup *page_memcg_check(struct page *page) -@@ -542,7 +559,7 @@ static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *ob - static inline bool folio_memcg_kmem(struct folio *folio) - { - VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page); -- VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio); -+ VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio); - return folio->memcg_data & MEMCG_DATA_KMEM; - } - -@@ -1606,6 +1623,19 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - } - #endif /* CONFIG_MEMCG */ - -+/* -+ * Extended information for slab objects stored as an array in page->memcg_data -+ * if MEMCG_DATA_OBJEXTS is set. 
-+ */ -+struct slabobj_ext { -+#ifdef CONFIG_MEMCG_KMEM -+ struct obj_cgroup *objcg; -+#endif -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ union codetag_ref ref; -+#endif -+} __aligned(8); -+ - static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) - { - __mod_lruvec_kmem_state(p, idx, 1); -diff --git a/include/linux/mempool.h b/include/linux/mempool.h -index 4aae6c06c..9fa126aa1 100644 ---- a/include/linux/mempool.h -+++ b/include/linux/mempool.h -@@ -5,6 +5,8 @@ - #ifndef _LINUX_MEMPOOL_H - #define _LINUX_MEMPOOL_H - -+#include -+#include - #include - #include - -@@ -39,18 +41,32 @@ void mempool_exit(mempool_t *pool); - int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id); --int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, -+ -+int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); -+#define mempool_init(...) \ -+ alloc_hooks(mempool_init_noprof(__VA_ARGS__)) - - extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); --extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, -+ -+extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int nid); -+#define mempool_create_node(...) \ -+ alloc_hooks(mempool_create_node_noprof(__VA_ARGS__)) -+ -+#define mempool_create(_min_nr, _alloc_fn, _free_fn, _pool_data) \ -+ mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data, \ -+ GFP_KERNEL, NUMA_NO_NODE) - - extern int mempool_resize(mempool_t *pool, int new_min_nr); - extern void mempool_destroy(mempool_t *pool); --extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc; -+ -+extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc; -+#define mempool_alloc(...) 
\ -+ alloc_hooks(mempool_alloc_noprof(__VA_ARGS__)) -+ - extern void mempool_free(void *element, mempool_t *pool); - - /* -@@ -61,19 +77,10 @@ extern void mempool_free(void *element, mempool_t *pool); - void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data); - void mempool_free_slab(void *element, void *pool_data); - --static inline int --mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc) --{ -- return mempool_init(pool, min_nr, mempool_alloc_slab, -- mempool_free_slab, (void *) kc); --} -- --static inline mempool_t * --mempool_create_slab_pool(int min_nr, struct kmem_cache *kc) --{ -- return mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab, -- (void *) kc); --} -+#define mempool_init_slab_pool(_pool, _min_nr, _kc) \ -+ mempool_init(_pool, (_min_nr), mempool_alloc_slab, mempool_free_slab, (void *)(_kc)) -+#define mempool_create_slab_pool(_min_nr, _kc) \ -+ mempool_create((_min_nr), mempool_alloc_slab, mempool_free_slab, (void *)(_kc)) - - /* - * a mempool_alloc_t and a mempool_free_t to kmalloc and kfree the -@@ -82,17 +89,12 @@ mempool_create_slab_pool(int min_nr, struct kmem_cache *kc) - void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data); - void mempool_kfree(void *element, void *pool_data); - --static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size) --{ -- return mempool_init(pool, min_nr, mempool_kmalloc, -- mempool_kfree, (void *) size); --} -- --static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) --{ -- return mempool_create(min_nr, mempool_kmalloc, mempool_kfree, -- (void *) size); --} -+#define mempool_init_kmalloc_pool(_pool, _min_nr, _size) \ -+ mempool_init(_pool, (_min_nr), mempool_kmalloc, mempool_kfree, \ -+ (void *)(unsigned long)(_size)) -+#define mempool_create_kmalloc_pool(_min_nr, _size) \ -+ mempool_create((_min_nr), mempool_kmalloc, mempool_kfree, \ -+ (void *)(unsigned long)(_size)) - - /* - * A mempool_alloc_t and mempool_free_t for a simple page allocator that -@@ -101,16 +103,11 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) - void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data); - void mempool_free_pages(void *element, void *pool_data); - --static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order) --{ -- return mempool_init(pool, min_nr, mempool_alloc_pages, -- mempool_free_pages, (void *)(long)order); --} -- --static inline mempool_t *mempool_create_page_pool(int min_nr, int order) --{ -- return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages, -- (void *)(long)order); --} -+#define mempool_init_page_pool(_pool, _min_nr, _order) \ -+ mempool_init(_pool, (_min_nr), mempool_alloc_pages, \ -+ mempool_free_pages, (void *)(long)(_order)) -+#define mempool_create_page_pool(_min_nr, _order) \ -+ mempool_create((_min_nr), mempool_alloc_pages, \ -+ mempool_free_pages, (void *)(long)(_order)) - - #endif /* _LINUX_MEMPOOL_H */ -diff --git a/include/linux/mm.h b/include/linux/mm.h -index 3c6c4c836..88b45fb4f 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -5,6 +5,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -2925,6 +2926,13 @@ extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); - /* Free the reserved page into the buddy system, so it gets managed. 
*/ - static inline void free_reserved_page(struct page *page) - { -+ union codetag_ref *ref; -+ -+ ref = get_page_tag_ref(page); -+ if (ref) { -+ set_codetag_empty(ref); -+ put_page_tag_ref(ref); -+ } - ClearPageReserved(page); - init_page_count(page); - __free_page(page); -diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index de10fc797..888b87b3c 100644 ---- a/include/linux/mm_types.h -+++ b/include/linux/mm_types.h -@@ -194,7 +194,7 @@ struct page { - /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ - atomic_t _refcount; - --#ifdef CONFIG_MEMCG -+#ifdef CONFIG_SLAB_OBJ_EXT - unsigned long memcg_data; - #endif - -@@ -320,7 +320,7 @@ struct folio { - void *private; - atomic_t _mapcount; - atomic_t _refcount; --#ifdef CONFIG_MEMCG -+#ifdef CONFIG_SLAB_OBJ_EXT - unsigned long memcg_data; - #endif - /* private: the union with struct page is transitional */ diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h -index bb0ee8052..fda37b6df 100644 +index 8d07116ca..b61438313 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -93,10 +93,10 @@ @@ -95996,211 +97215,6 @@ index 000000000..84c2f47c4 +typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; + +#endif /* __LINUX_NODEMASK_TYPES_H */ -diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h -index 67314f648..cff15ee54 100644 ---- a/include/linux/page_ext.h -+++ b/include/linux/page_ext.h -@@ -4,7 +4,6 @@ - - #include - #include --#include - - struct pglist_data; - -diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index 08328b579..347ba7f86 100644 ---- a/include/linux/pagemap.h -+++ b/include/linux/pagemap.h -@@ -467,14 +467,17 @@ static inline void *detach_page_private(struct page *page) - } - - #ifdef CONFIG_NUMA --struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order); -+struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); - #else --static inline struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) -+static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) - { -- return folio_alloc(gfp, order); -+ return folio_alloc_noprof(gfp, order); - } - #endif - -+#define filemap_alloc_folio(...) 
\ -+ alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__)) -+ - static inline struct page *__page_cache_alloc(gfp_t gfp) - { - return &filemap_alloc_folio(gfp, 0)->page; -diff --git a/include/linux/percpu.h b/include/linux/percpu.h -index 1338ea2aa..dc50dedb0 100644 ---- a/include/linux/percpu.h -+++ b/include/linux/percpu.h -@@ -2,12 +2,14 @@ - #ifndef __LINUX_PERCPU_H - #define __LINUX_PERCPU_H - -+#include - #include - #include - #include - #include - #include - #include -+#include - - #include - -@@ -116,7 +118,6 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); - #endif - --extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1); - extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); - extern bool is_kernel_percpu_address(unsigned long addr); - -@@ -124,10 +125,15 @@ extern bool is_kernel_percpu_address(unsigned long addr); - extern void __init setup_per_cpu_areas(void); - #endif - --extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1); --extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1); --extern void free_percpu(void __percpu *__pdata); --extern phys_addr_t per_cpu_ptr_to_phys(void *addr); -+extern void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, -+ gfp_t gfp) __alloc_size(1); -+ -+#define __alloc_percpu_gfp(_size, _align, _gfp) \ -+ alloc_hooks(pcpu_alloc_noprof(_size, _align, false, _gfp)) -+#define __alloc_percpu(_size, _align) \ -+ alloc_hooks(pcpu_alloc_noprof(_size, _align, false, GFP_KERNEL)) -+#define __alloc_reserved_percpu(_size, _align) \ -+ alloc_hooks(pcpu_alloc_noprof(_size, _align, true, GFP_KERNEL)) - - #define alloc_percpu_gfp(type, gfp) \ - (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \ -@@ -136,6 +142,9 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); - (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ - __alignof__(type)) - -+extern void free_percpu(void __percpu *__pdata); -+extern phys_addr_t per_cpu_ptr_to_phys(void *addr); -+ - extern unsigned long pcpu_nr_pages(void); - - #endif /* __LINUX_PERCPU_H */ -diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h -new file mode 100644 -index 000000000..ae9b0f359 ---- /dev/null -+++ b/include/linux/pgalloc_tag.h -@@ -0,0 +1,105 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * page allocation tagging -+ */ -+#ifndef _LINUX_PGALLOC_TAG_H -+#define _LINUX_PGALLOC_TAG_H -+ -+#include -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ -+#include -+ -+extern struct page_ext_operations page_alloc_tagging_ops; -+extern struct page_ext *page_ext_get(struct page *page); -+extern void page_ext_put(struct page_ext *page_ext); -+ -+static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext) -+{ -+ return (void *)page_ext + page_alloc_tagging_ops.offset; -+} -+ -+static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref) -+{ -+ return (void *)ref - page_alloc_tagging_ops.offset; -+} -+ -+static inline union codetag_ref *get_page_tag_ref(struct page *page) -+{ -+ if (page && mem_alloc_profiling_enabled()) { -+ struct page_ext *page_ext = page_ext_get(page); -+ -+ if (page_ext) -+ return codetag_ref_from_page_ext(page_ext); -+ } -+ return NULL; -+} -+ -+static inline void put_page_tag_ref(union codetag_ref *ref) -+{ -+ page_ext_put(page_ext_from_codetag_ref(ref)); -+} -+ -+static inline void pgalloc_tag_add(struct page *page, struct 
task_struct *task, -+ unsigned int order) -+{ -+ union codetag_ref *ref = get_page_tag_ref(page); -+ -+ if (ref) { -+ alloc_tag_add(ref, task->alloc_tag, PAGE_SIZE << order); -+ put_page_tag_ref(ref); -+ } -+} -+ -+static inline void pgalloc_tag_sub(struct page *page, unsigned int order) -+{ -+ union codetag_ref *ref = get_page_tag_ref(page); -+ -+ if (ref) { -+ alloc_tag_sub(ref, PAGE_SIZE << order); -+ put_page_tag_ref(ref); -+ } -+} -+ -+static inline void pgalloc_tag_split(struct page *page, unsigned int nr) -+{ -+ int i; -+ struct page_ext *page_ext; -+ union codetag_ref *ref; -+ struct alloc_tag *tag; -+ -+ if (!mem_alloc_profiling_enabled()) -+ return; -+ -+ page_ext = page_ext_get(page); -+ if (unlikely(!page_ext)) -+ return; -+ -+ ref = codetag_ref_from_page_ext(page_ext); -+ if (!ref->ct) -+ goto out; -+ -+ tag = ct_to_alloc_tag(ref->ct); -+ page_ext = page_ext_next(page_ext); -+ for (i = 1; i < nr; i++) { -+ /* New reference with 0 bytes accounted */ -+ alloc_tag_add(codetag_ref_from_page_ext(page_ext), tag, 0); -+ page_ext = page_ext_next(page_ext); -+ } -+out: -+ page_ext_put(page_ext); -+} -+ -+#else /* CONFIG_MEM_ALLOC_PROFILING */ -+ -+static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; } -+static inline void put_page_tag_ref(union codetag_ref *ref) {} -+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, -+ unsigned int order) {} -+static inline void pgalloc_tag_sub(struct page *page, unsigned int order) {} -+static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {} -+ -+#endif /* CONFIG_MEM_ALLOC_PROFILING */ -+ -+#endif /* _LINUX_PGALLOC_TAG_H */ diff --git a/include/linux/prandom.h b/include/linux/prandom.h index f2ed5b72b..f7f1e5251 100644 --- a/include/linux/prandom.h @@ -96213,43 +97227,8 @@ index f2ed5b72b..f7f1e5251 100644 #include struct rnd_state { -diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h -index 57467cbf4..92a8e670c 100644 ---- a/include/linux/rhashtable-types.h -+++ b/include/linux/rhashtable-types.h -@@ -9,6 +9,7 @@ - #ifndef _LINUX_RHASHTABLE_TYPES_H - #define _LINUX_RHASHTABLE_TYPES_H - -+#include - #include - #include - #include -@@ -88,6 +89,7 @@ struct rhashtable { - struct mutex mutex; - spinlock_t lock; - atomic_t nelems; -+ struct alloc_tag *alloc_tag; - }; - - /** -@@ -127,9 +129,12 @@ struct rhashtable_iter { - bool end_of_table; - }; - --int rhashtable_init(struct rhashtable *ht, -+int rhashtable_init_noprof(struct rhashtable *ht, - const struct rhashtable_params *params); --int rhltable_init(struct rhltable *hlt, -+#define rhashtable_init(...) alloc_hooks(rhashtable_init_noprof(__VA_ARGS__)) -+ -+int rhltable_init_noprof(struct rhltable *hlt, - const struct rhashtable_params *params); -+#define rhltable_init(...) 
alloc_hooks(rhltable_init_noprof(__VA_ARGS__)) - - #endif /* _LINUX_RHASHTABLE_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h -index 847332470..5c359b8b2 100644 +index 609bde814..a82f63541 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -20,7 +20,7 @@ @@ -96261,26 +97240,7 @@ index 847332470..5c359b8b2 100644 #include #include #include -@@ -763,6 +763,10 @@ struct task_struct { - unsigned int flags; - unsigned int ptrace; - -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ struct alloc_tag *alloc_tag; -+#endif -+ - #ifdef CONFIG_SMP - int on_cpu; - struct __call_single_node wake_entry; -@@ -802,6 +806,7 @@ struct task_struct { - struct task_group *sched_task_group; - #endif - -+ - #ifdef CONFIG_UCLAMP_TASK - /* - * Clamp values requested for a scheduling entity. -@@ -871,6 +876,7 @@ struct task_struct { +@@ -870,6 +870,7 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; @@ -96288,7 +97248,7 @@ index 847332470..5c359b8b2 100644 int exit_state; int exit_code; -@@ -1163,7 +1169,7 @@ struct task_struct { +@@ -1162,7 +1163,7 @@ struct task_struct { #endif #ifdef CONFIG_LOCKDEP @@ -96297,30 +97257,6 @@ index 847332470..5c359b8b2 100644 u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; -@@ -2446,4 +2452,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } - - extern void sched_set_stop_task(int cpu, struct task_struct *stop); - -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) -+{ -+ swap(current->alloc_tag, tag); -+ return tag; -+} -+ -+static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) -+{ -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+ WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); -+#endif -+ current->alloc_tag = old; -+} -+#else -+static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; } -+#define alloc_tag_restore(_tag, _old) -+#endif -+ - #endif diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 515d7fcb9..cc02410f2 100644 --- a/include/linux/seq_buf.h @@ -96381,798 +97317,11 @@ index 224293b2d..a15a45d06 100644 #ifdef CONFIG_SHRINKER_DEBUG extern int shrinker_debugfs_add(struct shrinker *shrinker); -diff --git a/include/linux/six.h b/include/linux/six.h -new file mode 100644 -index 000000000..394da423c ---- /dev/null -+++ b/include/linux/six.h -@@ -0,0 +1,388 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+ -+#ifndef _LINUX_SIX_H -+#define _LINUX_SIX_H -+ -+/** -+ * DOC: SIX locks overview -+ * -+ * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores -+ * but with an additional state: read/shared, intent, exclusive/write -+ * -+ * The purpose of the intent state is to allow for greater concurrency on tree -+ * structures without deadlocking. In general, a read can't be upgraded to a -+ * write lock without deadlocking, so an operation that updates multiple nodes -+ * will have to take write locks for the full duration of the operation. -+ * -+ * But by adding an intent state, which is exclusive with other intent locks but -+ * not with readers, we can take intent locks at thte start of the operation, -+ * and then take write locks only for the actual update to each individual -+ * nodes, without deadlocking. 
-+ * -+ * Example usage: -+ * six_lock_read(&foo->lock); -+ * six_unlock_read(&foo->lock); -+ * -+ * An intent lock must be held before taking a write lock: -+ * six_lock_intent(&foo->lock); -+ * six_lock_write(&foo->lock); -+ * six_unlock_write(&foo->lock); -+ * six_unlock_intent(&foo->lock); -+ * -+ * Other operations: -+ * six_trylock_read() -+ * six_trylock_intent() -+ * six_trylock_write() -+ * -+ * six_lock_downgrade() convert from intent to read -+ * six_lock_tryupgrade() attempt to convert from read to intent, may fail -+ * -+ * There are also interfaces that take the lock type as an enum: -+ * -+ * six_lock_type(&foo->lock, SIX_LOCK_read); -+ * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent) -+ * six_lock_type(&foo->lock, SIX_LOCK_write); -+ * six_unlock_type(&foo->lock, SIX_LOCK_write); -+ * six_unlock_type(&foo->lock, SIX_LOCK_intent); -+ * -+ * Lock sequence numbers - unlock(), relock(): -+ * -+ * Locks embed sequences numbers, which are incremented on write lock/unlock. -+ * This allows locks to be dropped and the retaken iff the state they protect -+ * hasn't changed; this makes it much easier to avoid holding locks while e.g. -+ * doing IO or allocating memory. -+ * -+ * Example usage: -+ * six_lock_read(&foo->lock); -+ * u32 seq = six_lock_seq(&foo->lock); -+ * six_unlock_read(&foo->lock); -+ * -+ * some_operation_that_may_block(); -+ * -+ * if (six_relock_read(&foo->lock, seq)) { ... } -+ * -+ * If the relock operation succeeds, it is as if the lock was never unlocked. -+ * -+ * Reentrancy: -+ * -+ * Six locks are not by themselves reentrent, but have counters for both the -+ * read and intent states that can be used to provide reentrency by an upper -+ * layer that tracks held locks. If a lock is known to already be held in the -+ * read or intent state, six_lock_increment() can be used to bump the "lock -+ * held in this state" counter, increasing the number of unlock calls that -+ * will be required to fully unlock it. -+ * -+ * Example usage: -+ * six_lock_read(&foo->lock); -+ * six_lock_increment(&foo->lock, SIX_LOCK_read); -+ * six_unlock_read(&foo->lock); -+ * six_unlock_read(&foo->lock); -+ * foo->lock is now fully unlocked. -+ * -+ * Since the intent state supercedes read, it's legal to increment the read -+ * counter when holding an intent lock, but not the reverse. -+ * -+ * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) -+ * is not legal. -+ * -+ * should_sleep_fn: -+ * -+ * There is a six_lock() variant that takes a function pointer that is called -+ * immediately prior to schedule() when blocking, and may return an error to -+ * abort. -+ * -+ * One possible use for this feature is when objects being locked are part of -+ * a cache and may reused, and lock ordering is based on a property of the -+ * object that will change when the object is reused - i.e. logical key order. -+ * -+ * If looking up an object in the cache may race with object reuse, and lock -+ * ordering is required to prevent deadlock, object reuse may change the -+ * correct lock order for that object and cause a deadlock. should_sleep_fn -+ * can be used to check if the object is still the object we want and avoid -+ * this deadlock. -+ * -+ * Wait list entry interface: -+ * -+ * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a -+ * wait list entry. 
By embedding six_lock_waiter into another object, and by -+ * traversing lock waitlists, it is then possible for an upper layer to -+ * implement full cycle detection for deadlock avoidance. -+ * -+ * should_sleep_fn should be used for invoking the cycle detector, walking the -+ * graph of held locks to check for a deadlock. The upper layer must track -+ * held locks for each thread, and each thread's held locks must be reachable -+ * from its six_lock_waiter object. -+ * -+ * six_lock_waiter() will add the wait object to the waitlist re-trying taking -+ * the lock, and before calling should_sleep_fn, and the wait object will not -+ * be removed from the waitlist until either the lock has been successfully -+ * acquired, or we aborted because should_sleep_fn returned an error. -+ * -+ * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will -+ * have timestamps in strictly ascending order - this is so the timestamp can -+ * be used as a cursor for lock graph traverse. -+ */ -+ -+#include -+#include -+#include -+#include -+ -+enum six_lock_type { -+ SIX_LOCK_read, -+ SIX_LOCK_intent, -+ SIX_LOCK_write, -+}; -+ -+struct six_lock { -+ atomic_t state; -+ u32 seq; -+ unsigned intent_lock_recurse; -+ struct task_struct *owner; -+ unsigned __percpu *readers; -+ struct optimistic_spin_queue osq; -+ raw_spinlock_t wait_lock; -+ struct list_head wait_list; -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ struct lockdep_map dep_map; -+#endif -+}; -+ -+struct six_lock_waiter { -+ struct list_head list; -+ struct task_struct *task; -+ enum six_lock_type lock_want; -+ bool lock_acquired; -+ u64 start_time; -+}; -+ -+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); -+ -+void six_lock_exit(struct six_lock *lock); -+ -+enum six_lock_init_flags { -+ SIX_LOCK_INIT_PCPU = 1U << 0, -+}; -+ -+void __six_lock_init(struct six_lock *lock, const char *name, -+ struct lock_class_key *key, enum six_lock_init_flags flags); -+ -+/** -+ * six_lock_init - initialize a six lock -+ * @lock: lock to initialize -+ * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU -+ */ -+#define six_lock_init(lock, flags) \ -+do { \ -+ static struct lock_class_key __key; \ -+ \ -+ __six_lock_init((lock), #lock, &__key, flags); \ -+} while (0) -+ -+/** -+ * six_lock_seq - obtain current lock sequence number -+ * @lock: six_lock to obtain sequence number for -+ * -+ * @lock should be held for read or intent, and not write -+ * -+ * By saving the lock sequence number, we can unlock @lock and then (typically -+ * after some blocking operation) attempt to relock it: the relock will succeed -+ * if the sequence number hasn't changed, meaning no write locks have been taken -+ * and state corresponding to what @lock protects is still valid. -+ */ -+static inline u32 six_lock_seq(const struct six_lock *lock) -+{ -+ return lock->seq; -+} -+ -+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); -+ -+/** -+ * six_trylock_type - attempt to take a six lock without blocking -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * -+ * Return: true on success, false on failure. 
-+ */ -+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ return six_trylock_ip(lock, type, _THIS_IP_); -+} -+ -+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, -+ struct six_lock_waiter *wait, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip); -+ -+/** -+ * six_lock_waiter - take a lock, with full waitlist interface -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @wait: pointer to wait object, which will be added to lock's waitlist -+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior -+ * to scheduling -+ * @p: passed through to @should_sleep_fn -+ * -+ * This is a convenience wrapper around six_lock_ip_waiter(), see that function -+ * for full documentation. -+ * -+ * Return: 0 on success, or the return code from @should_sleep_fn on failure. -+ */ -+static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type, -+ struct six_lock_waiter *wait, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); -+} -+ -+/** -+ * six_lock_ip - take a six lock lock -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior -+ * to scheduling -+ * @p: passed through to @should_sleep_fn -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * Return: 0 on success, or the return code from @should_sleep_fn on failure. -+ */ -+static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip) -+{ -+ struct six_lock_waiter wait; -+ -+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); -+} -+ -+/** -+ * six_lock_type - take a six lock lock -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior -+ * to scheduling -+ * @p: passed through to @should_sleep_fn -+ * -+ * Return: 0 on success, or the return code from @should_sleep_fn on failure. -+ */ -+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ struct six_lock_waiter wait; -+ -+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); -+} -+ -+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq, unsigned long ip); -+ -+/** -+ * six_relock_type - attempt to re-take a lock that was held previously -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @seq: lock sequence number obtained from six_lock_seq() while lock was -+ * held previously -+ * -+ * Return: true on success, false on failure. -+ */ -+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq) -+{ -+ return six_relock_ip(lock, type, seq, _THIS_IP_); -+} -+ -+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); -+ -+/** -+ * six_unlock_type - drop a six lock -+ * @lock: lock to unlock -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * -+ * When a lock is held multiple times (because six_lock_incement()) was used), -+ * this decrements the 'lock held' counter by one. 
-+ * -+ * For example: -+ * six_lock_read(&foo->lock); read count 1 -+ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 -+ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 -+ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 -+ */ -+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ six_unlock_ip(lock, type, _THIS_IP_); -+} -+ -+#define __SIX_LOCK(type) \ -+static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ -+{ \ -+ return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ -+} \ -+ \ -+static inline bool six_trylock_##type(struct six_lock *lock) \ -+{ \ -+ return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ -+} \ -+ \ -+static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ -+ struct six_lock_waiter *wait, \ -+ six_lock_should_sleep_fn should_sleep_fn, void *p,\ -+ unsigned long ip) \ -+{ \ -+ return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ -+} \ -+ \ -+static inline int six_lock_ip_##type(struct six_lock *lock, \ -+ six_lock_should_sleep_fn should_sleep_fn, void *p, \ -+ unsigned long ip) \ -+{ \ -+ return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ -+} \ -+ \ -+static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ -+{ \ -+ return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ -+} \ -+ \ -+static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ -+{ \ -+ return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ -+} \ -+ \ -+static inline int six_lock_##type(struct six_lock *lock, \ -+ six_lock_should_sleep_fn fn, void *p)\ -+{ \ -+ return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ -+} \ -+ \ -+static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ -+{ \ -+ six_unlock_ip(lock, SIX_LOCK_##type, ip); \ -+} \ -+ \ -+static inline void six_unlock_##type(struct six_lock *lock) \ -+{ \ -+ six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ -+} -+ -+__SIX_LOCK(read) -+__SIX_LOCK(intent) -+__SIX_LOCK(write) -+#undef __SIX_LOCK -+ -+void six_lock_downgrade(struct six_lock *); -+bool six_lock_tryupgrade(struct six_lock *); -+bool six_trylock_convert(struct six_lock *, enum six_lock_type, -+ enum six_lock_type); -+ -+void six_lock_increment(struct six_lock *, enum six_lock_type); -+ -+void six_lock_wakeup_all(struct six_lock *); -+ -+struct six_lock_count { -+ unsigned n[3]; -+}; -+ -+struct six_lock_count six_lock_counts(struct six_lock *); -+void six_lock_readers_add(struct six_lock *, int); -+ -+#endif /* _LINUX_SIX_H */ -diff --git a/include/linux/slab.h b/include/linux/slab.h -index 6b3e155b7..f7bc3ab70 100644 ---- a/include/linux/slab.h -+++ b/include/linux/slab.h -@@ -147,6 +147,13 @@ - #endif - #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ - -+#ifdef CONFIG_SLAB_OBJ_EXT -+/* Slab created using create_boot_cache */ -+#define SLAB_NO_OBJ_EXT ((slab_flags_t __force)0x20000000U) -+#else -+#define SLAB_NO_OBJ_EXT 0 -+#endif -+ - /* - * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. - * -@@ -206,7 +213,9 @@ int kmem_cache_shrink(struct kmem_cache *s); - /* - * Common kmalloc functions provided by all allocators - */ --void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2); -+void * __must_check krealloc_noprof(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2); -+#define krealloc(...) 
alloc_hooks(krealloc_noprof(__VA_ARGS__)) -+ - void kfree(const void *objp); - void kfree_sensitive(const void *objp); - size_t __ksize(const void *objp); -@@ -444,7 +453,10 @@ static __always_inline unsigned int __kmalloc_index(size_t size, - static_assert(PAGE_SHIFT <= 20); - #define kmalloc_index(s) __kmalloc_index(s, true) - --void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); -+#include -+ -+void *__kmalloc_noprof(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); -+#define __kmalloc(...) alloc_hooks(__kmalloc_noprof(__VA_ARGS__)) - - /** - * kmem_cache_alloc - Allocate an object -@@ -456,9 +468,13 @@ void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_siz - * - * Return: pointer to the new object or %NULL in case of error - */ --void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc; --void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, -- gfp_t gfpflags) __assume_slab_alignment __malloc; -+void *kmem_cache_alloc_noprof(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc; -+#define kmem_cache_alloc(...) alloc_hooks(kmem_cache_alloc_noprof(__VA_ARGS__)) -+ -+void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, -+ gfp_t gfpflags) __assume_slab_alignment __malloc; -+#define kmem_cache_alloc_lru(...) alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__)) -+ - void kmem_cache_free(struct kmem_cache *s, void *objp); - - /* -@@ -469,29 +485,40 @@ void kmem_cache_free(struct kmem_cache *s, void *objp); - * Note that interrupts must be enabled when calling these functions. - */ - void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); --int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p); -+ -+int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p); -+#define kmem_cache_alloc_bulk(...) alloc_hooks(kmem_cache_alloc_bulk_noprof(__VA_ARGS__)) - - static __always_inline void kfree_bulk(size_t size, void **p) - { - kmem_cache_free_bulk(NULL, size, p); - } - --void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment -+void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment - __alloc_size(1); --void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment -- __malloc; -+#define __kmalloc_node(...) alloc_hooks(__kmalloc_node_noprof(__VA_ARGS__)) - --void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) -+void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment -+ __malloc; -+#define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__)) -+ -+void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t flags, size_t size) - __assume_kmalloc_alignment __alloc_size(3); - --void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, -- int node, size_t size) __assume_kmalloc_alignment -+void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, -+ int node, size_t size) __assume_kmalloc_alignment - __alloc_size(4); --void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment -+#define kmalloc_trace(...) alloc_hooks(kmalloc_trace_noprof(__VA_ARGS__)) -+ -+#define kmalloc_node_trace(...) 
alloc_hooks(kmalloc_node_trace_noprof(__VA_ARGS__)) -+ -+void *kmalloc_large_noprof(size_t size, gfp_t flags) __assume_page_alignment - __alloc_size(1); -+#define kmalloc_large(...) alloc_hooks(kmalloc_large_noprof(__VA_ARGS__)) - --void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_alignment -+void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) __assume_page_alignment - __alloc_size(1); -+#define kmalloc_large_node(...) alloc_hooks(kmalloc_large_node_noprof(__VA_ARGS__)) - - /** - * kmalloc - allocate kernel memory -@@ -547,37 +574,39 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align - * Try really hard to succeed the allocation but fail - * eventually. - */ --static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) -+static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t flags) - { - if (__builtin_constant_p(size) && size) { - unsigned int index; - - if (size > KMALLOC_MAX_CACHE_SIZE) -- return kmalloc_large(size, flags); -+ return kmalloc_large_noprof(size, flags); - - index = kmalloc_index(size); -- return kmalloc_trace( -+ return kmalloc_trace_noprof( - kmalloc_caches[kmalloc_type(flags)][index], - flags, size); - } -- return __kmalloc(size, flags); -+ return __kmalloc_noprof(size, flags); - } -+#define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__)) - --static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) -+static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gfp_t flags, int node) - { - if (__builtin_constant_p(size) && size) { - unsigned int index; - - if (size > KMALLOC_MAX_CACHE_SIZE) -- return kmalloc_large_node(size, flags, node); -+ return kmalloc_large_node_noprof(size, flags, node); - - index = kmalloc_index(size); -- return kmalloc_node_trace( -+ return kmalloc_node_trace_noprof( - kmalloc_caches[kmalloc_type(flags)][index], - flags, node, size); - } -- return __kmalloc_node(size, flags, node); -+ return __kmalloc_node_noprof(size, flags, node); - } -+#define kmalloc_node(...) alloc_hooks(kmalloc_node_noprof(__VA_ARGS__)) - - /** - * kmalloc_array - allocate memory for an array. -@@ -585,16 +614,17 @@ static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t fla - * @size: element size. - * @flags: the type of memory to allocate (see kmalloc). - */ --static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_t flags) -+static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t size, gfp_t flags) - { - size_t bytes; - - if (unlikely(check_mul_overflow(n, size, &bytes))) - return NULL; - if (__builtin_constant_p(n) && __builtin_constant_p(size)) -- return kmalloc(bytes, flags); -- return __kmalloc(bytes, flags); -+ return kmalloc_noprof(bytes, flags); -+ return kmalloc_noprof(bytes, flags); - } -+#define kmalloc_array(...) alloc_hooks(kmalloc_array_noprof(__VA_ARGS__)) - - /** - * krealloc_array - reallocate memory for an array. 
-@@ -603,18 +633,19 @@ static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_ - * @new_size: new size of a single member of the array - * @flags: the type of memory to allocate (see kmalloc) - */ --static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p, -- size_t new_n, -- size_t new_size, -- gfp_t flags) -+static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p, -+ size_t new_n, -+ size_t new_size, -+ gfp_t flags) - { - size_t bytes; - - if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) - return NULL; - -- return krealloc(p, bytes, flags); -+ return krealloc_noprof(p, bytes, flags); - } -+#define krealloc_array(...) alloc_hooks(krealloc_array_noprof(__VA_ARGS__)) - - /** - * kcalloc - allocate memory for an array. The memory is set to zero. -@@ -622,16 +653,11 @@ static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p, - * @size: element size. - * @flags: the type of memory to allocate (see kmalloc). - */ --static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flags) --{ -- return kmalloc_array(n, size, flags | __GFP_ZERO); --} -+#define kcalloc(_n, _size, _flags) kmalloc_array(_n, _size, (_flags) | __GFP_ZERO) - --void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, -+void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, int node, - unsigned long caller) __alloc_size(1); --#define kmalloc_node_track_caller(size, flags, node) \ -- __kmalloc_node_track_caller(size, flags, node, \ -- _RET_IP_) -+#define kmalloc_node_track_caller(...) alloc_hooks(kmalloc_node_track_caller_noprof(__VA_ARGS__, _RET_IP_)) - - /* - * kmalloc_track_caller is a special version of kmalloc that records the -@@ -641,11 +667,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, - * allocator where we care about the real place the memory allocation - * request comes from. - */ --#define kmalloc_track_caller(size, flags) \ -- __kmalloc_node_track_caller(size, flags, \ -- NUMA_NO_NODE, _RET_IP_) -+#define kmalloc_track_caller(...) kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE) - --static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, -+static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, - int node) - { - size_t bytes; -@@ -653,75 +677,51 @@ static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, - if (unlikely(check_mul_overflow(n, size, &bytes))) - return NULL; - if (__builtin_constant_p(n) && __builtin_constant_p(size)) -- return kmalloc_node(bytes, flags, node); -- return __kmalloc_node(bytes, flags, node); -+ return kmalloc_node_noprof(bytes, flags, node); -+ return __kmalloc_node_noprof(bytes, flags, node); - } -+#define kmalloc_array_node(...) alloc_hooks(kmalloc_array_node_noprof(__VA_ARGS__)) - --static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) --{ -- return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); --} -+#define kcalloc_node(_n, _size, _flags, _node) kmalloc_array_node(_n, _size, (_flags) | __GFP_ZERO, _node) - - /* - * Shortcuts - */ --static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags) --{ -- return kmem_cache_alloc(k, flags | __GFP_ZERO); --} -+#define kmem_cache_zalloc(_k, _flags) kmem_cache_alloc(_k, (_flags)|__GFP_ZERO) - - /** - * kzalloc - allocate memory. The memory is set to zero. 
- * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate (see kmalloc). - */ --static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags) --{ -- return kmalloc(size, flags | __GFP_ZERO); --} -- --/** -- * kzalloc_node - allocate zeroed memory from a particular memory node. -- * @size: how many bytes of memory are required. -- * @flags: the type of memory to allocate (see kmalloc). -- * @node: memory node from which to allocate -- */ --static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int node) -+static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags) - { -- return kmalloc_node(size, flags | __GFP_ZERO, node); -+ return kmalloc_noprof(size, flags | __GFP_ZERO); - } -+#define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__)) -+#define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node) - --extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __alloc_size(1); --static inline __alloc_size(1) void *kvmalloc(size_t size, gfp_t flags) --{ -- return kvmalloc_node(size, flags, NUMA_NO_NODE); --} --static inline __alloc_size(1) void *kvzalloc_node(size_t size, gfp_t flags, int node) --{ -- return kvmalloc_node(size, flags | __GFP_ZERO, node); --} --static inline __alloc_size(1) void *kvzalloc(size_t size, gfp_t flags) --{ -- return kvmalloc(size, flags | __GFP_ZERO); --} -+extern void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) __alloc_size(1); -+#define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__)) - --static inline __alloc_size(1, 2) void *kvmalloc_array(size_t n, size_t size, gfp_t flags) --{ -- size_t bytes; -+#define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE) -+#define kvzalloc(_size, _flags) kvmalloc(_size, _flags|__GFP_ZERO) - -- if (unlikely(check_mul_overflow(n, size, &bytes))) -- return NULL; -+#define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, _flags|__GFP_ZERO, _node) - -- return kvmalloc(bytes, flags); --} -+#define kvmalloc_array(_n, _size, _flags) \ -+({ \ -+ size_t _bytes; \ -+ \ -+ !check_mul_overflow(_n, _size, &_bytes) ? kvmalloc(_bytes, _flags) : NULL; \ -+}) - --static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t flags) --{ -- return kvmalloc_array(n, size, flags | __GFP_ZERO); --} -+#define kvcalloc(_n, _size, _flags) kvmalloc_array(_n, _size, _flags|__GFP_ZERO) - --extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) -+extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - __realloc_size(3); -+#define kvrealloc(...) 
alloc_hooks(kvrealloc_noprof(__VA_ARGS__)) -+ - extern void kvfree(const void *addr); - extern void kvfree_sensitive(const void *addr, size_t len); - -diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h -index a61e7d55d..23f14dcb8 100644 ---- a/include/linux/slab_def.h -+++ b/include/linux/slab_def.h -@@ -107,7 +107,7 @@ static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *sla - * reciprocal_divide(offset, cache->reciprocal_buffer_size) - */ - static inline unsigned int obj_to_index(const struct kmem_cache *cache, -- const struct slab *slab, void *obj) -+ const struct slab *slab, const void *obj) - { - u32 offset = (obj - slab->s_mem); - return reciprocal_divide(offset, cache->reciprocal_buffer_size); -diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h -index f6df03f93..e8be5b368 100644 ---- a/include/linux/slub_def.h -+++ b/include/linux/slub_def.h -@@ -176,14 +176,14 @@ static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *sla - - /* Determine object index from a given position */ - static inline unsigned int __obj_to_index(const struct kmem_cache *cache, -- void *addr, void *obj) -+ void *addr, const void *obj) - { - return reciprocal_divide(kasan_reset_tag(obj) - addr, - cache->reciprocal_size); - } - - static inline unsigned int obj_to_index(const struct kmem_cache *cache, -- const struct slab *slab, void *obj) -+ const struct slab *slab, const void *obj) - { - if (is_kfence_address(obj)) - return 0; -diff --git a/include/linux/string.h b/include/linux/string.h -index c062c581a..198ca51ed 100644 ---- a/include/linux/string.h -+++ b/include/linux/string.h -@@ -96,6 +96,7 @@ extern char * strpbrk(const char *,const char *); - #ifndef __HAVE_ARCH_STRSEP - extern char * strsep(char **,const char *); - #endif -+extern char *strsep_no_empty(char **, const char *); - #ifndef __HAVE_ARCH_STRSPN - extern __kernel_size_t strspn(const char *,const char *); - #endif -@@ -176,7 +177,9 @@ extern void kfree_const(const void *x); - extern char *kstrdup(const char *s, gfp_t gfp) __malloc; - extern const char *kstrdup_const(const char *s, gfp_t gfp); - extern char *kstrndup(const char *s, size_t len, gfp_t gfp); --extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); -+extern void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) __realloc_size(2); -+#define kmemdup(...) alloc_hooks(kmemdup_noprof(__VA_ARGS__)) -+ - extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); - extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); - diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h -index fae6beaaa..ae51580b9 100644 +index 789ab3004..1cc137402 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h -@@ -16,15 +16,14 @@ static inline bool string_is_terminated(const char *s, int len) +@@ -17,15 +17,14 @@ static inline bool string_is_terminated(const char *s, int len) return memchr(s, '\0', len) ? 
true : false; } @@ -97194,121 +97343,6 @@ index fae6beaaa..ae51580b9 100644 int parse_int_array_user(const char __user *from, size_t count, int **array); -diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h -index bb9d3f554..d8e0cacfc 100644 ---- a/include/linux/time_namespace.h -+++ b/include/linux/time_namespace.h -@@ -11,6 +11,8 @@ - struct user_namespace; - extern struct user_namespace init_user_ns; - -+struct vm_area_struct; -+ - struct timens_offsets { - struct timespec64 monotonic; - struct timespec64 boottime; -diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h -index c720be70c..106d78e75 100644 ---- a/include/linux/vmalloc.h -+++ b/include/linux/vmalloc.h -@@ -2,6 +2,8 @@ - #ifndef _LINUX_VMALLOC_H - #define _LINUX_VMALLOC_H - -+#include -+#include - #include - #include - #include -@@ -137,26 +139,54 @@ extern unsigned long vmalloc_nr_pages(void); - static inline unsigned long vmalloc_nr_pages(void) { return 0; } - #endif - --extern void *vmalloc(unsigned long size) __alloc_size(1); --extern void *vzalloc(unsigned long size) __alloc_size(1); --extern void *vmalloc_user(unsigned long size) __alloc_size(1); --extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); --extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); --extern void *vmalloc_32(unsigned long size) __alloc_size(1); --extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); --extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); --extern void *__vmalloc_node_range(unsigned long size, unsigned long align, -+extern void *vmalloc_noprof(unsigned long size) __alloc_size(1); -+#define vmalloc(...) alloc_hooks(vmalloc_noprof(__VA_ARGS__)) -+ -+extern void *vzalloc_noprof(unsigned long size) __alloc_size(1); -+#define vzalloc(...) alloc_hooks(vzalloc_noprof(__VA_ARGS__)) -+ -+extern void *vmalloc_user_noprof(unsigned long size) __alloc_size(1); -+#define vmalloc_user(...) alloc_hooks(vmalloc_user_noprof(__VA_ARGS__)) -+ -+extern void *vmalloc_node_noprof(unsigned long size, int node) __alloc_size(1); -+#define vmalloc_node(...) alloc_hooks(vmalloc_node_noprof(__VA_ARGS__)) -+ -+extern void *vzalloc_node_noprof(unsigned long size, int node) __alloc_size(1); -+#define vzalloc_node(...) alloc_hooks(vzalloc_node_noprof(__VA_ARGS__)) -+ -+extern void *vmalloc_32_noprof(unsigned long size) __alloc_size(1); -+#define vmalloc_32(...) alloc_hooks(vmalloc_32_noprof(__VA_ARGS__)) -+ -+extern void *vmalloc_32_user_noprof(unsigned long size) __alloc_size(1); -+#define vmalloc_32_user(...) alloc_hooks(vmalloc_32_user_noprof(__VA_ARGS__)) -+ -+extern void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); -+#define __vmalloc(...) alloc_hooks(__vmalloc_noprof(__VA_ARGS__)) -+ -+extern void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, - unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, unsigned long vm_flags, int node, - const void *caller) __alloc_size(1); --void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, -+#define __vmalloc_node_range(...) alloc_hooks(__vmalloc_node_range_noprof(__VA_ARGS__)) -+ -+void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, - int node, const void *caller) __alloc_size(1); --void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1); -+#define __vmalloc_node(...) 
alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__)) -+ -+void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); -+#define vmalloc_huge(...) alloc_hooks(vmalloc_huge_noprof(__VA_ARGS__)) -+ -+extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); -+#define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__)) -+ -+extern void *vmalloc_array_noprof(size_t n, size_t size) __alloc_size(1, 2); -+#define vmalloc_array(...) alloc_hooks(vmalloc_array_noprof(__VA_ARGS__)) -+ -+extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); -+#define __vcalloc(...) alloc_hooks(__vcalloc_noprof(__VA_ARGS__)) - --extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); --extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2); --extern void *__vcalloc(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); --extern void *vcalloc(size_t n, size_t size) __alloc_size(1, 2); -+extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); -+#define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) - - extern void vfree(const void *addr); - extern void vfree_atomic(const void *addr); -diff --git a/init/Kconfig b/init/Kconfig -index b6d38eccc..cec6bac1a 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -940,10 +940,14 @@ config CGROUP_FAVOR_DYNMODS - - Say N if unsure. - -+config SLAB_OBJ_EXT -+ bool -+ - config MEMCG - bool "Memory controller" - select PAGE_COUNTER - select EVENTFD -+ select SLAB_OBJ_EXT - help - Provides control over the memory footprint of tasks in a cgroup. - diff --git a/init/init_task.c b/init/init_task.c index ff6c4b9bf..f703116e0 100644 --- a/init/init_task.c @@ -97321,57 +97355,21 @@ index ff6c4b9bf..f703116e0 100644 .restart_block = { .fn = do_no_restart_syscall, }, -diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks -index 4198f0273..b2abd9a5d 100644 ---- a/kernel/Kconfig.locks -+++ b/kernel/Kconfig.locks -@@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB - config MMIOWB - def_bool y if ARCH_HAS_MMIOWB - depends on SMP -+ -+config SIXLOCKS -+ bool -diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c -index 9a4db5cce..fc42930af 100644 ---- a/kernel/dma/mapping.c -+++ b/kernel/dma/mapping.c -@@ -570,9 +570,9 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size, - size = PAGE_ALIGN(size); - if (dma_alloc_direct(dev, ops)) - return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); -- if (!ops->alloc_pages) -+ if (!ops->alloc_pages_op) - return NULL; -- return ops->alloc_pages(dev, size, dma_handle, dir, gfp); -+ return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp); - } - - struct page *dma_alloc_pages(struct device *dev, size_t size, -diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile -index 0db4093d1..a095dbbf0 100644 ---- a/kernel/locking/Makefile -+++ b/kernel/locking/Makefile -@@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o - obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o - obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o - obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o -+obj-$(CONFIG_SIXLOCKS) += six.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c -index 4dfd2f3e0..0463302e2 100644 +index 111607d91..b6c3a8788 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c -@@ -3039,6 +3039,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) - if ((next->read == 2) && prev->read) - continue; +@@ -3056,6 +3056,9 @@ 
check_deadlock(struct task_struct *curr, struct held_lock *next) -+ if (hlock_class(next)->no_check_recursion) + class = hlock_class(prev); + ++ if (class->no_check_recursion) + continue; + - /* - * We're holding the nest_lock, which serializes this lock's - * nesting behaviour. -@@ -3100,6 +3103,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, + if (class->cmp_fn && + class->cmp_fn(prev->instance, next->instance) < 0) + continue; +@@ -3121,6 +3124,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return 2; } @@ -97379,10 +97377,10 @@ index 4dfd2f3e0..0463302e2 100644 + hlock_class(prev)->no_check_recursion) + return 2; + - /* - * Prove that the new -> dependency would not - * create a circular dependency in the graph. (We do this by -@@ -6551,6 +6558,26 @@ void debug_check_no_locks_held(void) + if (prev->class_idx == next->class_idx) { + struct lock_class *class = hlock_class(prev); + +@@ -6607,6 +6614,26 @@ void debug_check_no_locks_held(void) } EXPORT_SYMBOL_GPL(debug_check_no_locks_held); @@ -97409,7 +97407,7 @@ index 4dfd2f3e0..0463302e2 100644 #ifdef __KERNEL__ void debug_show_all_locks(void) { -@@ -6664,3 +6691,22 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) +@@ -6720,3 +6747,22 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); @@ -97432,6 +97430,20 @@ index 4dfd2f3e0..0463302e2 100644 +} +EXPORT_SYMBOL_GPL(lockdep_set_no_check_recursion); +#endif +diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c +index d973fe604..2deeeca3e 100644 +--- a/kernel/locking/mutex.c ++++ b/kernel/locking/mutex.c +@@ -1126,6 +1126,9 @@ EXPORT_SYMBOL(ww_mutex_lock_interruptible); + #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ + #endif /* !CONFIG_PREEMPT_RT */ + ++EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin); ++EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end); ++ + /** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index d5610ad52..b752ec5cc 100644 --- a/kernel/locking/osq_lock.c @@ -97449,1001 +97461,6 @@ index d5610ad52..b752ec5cc 100644 WRITE_ONCE(next->locked, 1); } +EXPORT_SYMBOL_GPL(osq_unlock); -diff --git a/kernel/locking/six.c b/kernel/locking/six.c -new file mode 100644 -index 000000000..0b9c4bb7c ---- /dev/null -+++ b/kernel/locking/six.c -@@ -0,0 +1,893 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#ifdef DEBUG -+#define EBUG_ON(cond) BUG_ON(cond) -+#else -+#define EBUG_ON(cond) do {} while (0) -+#endif -+ -+#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) -+#define six_release(l, ip) lock_release(l, ip) -+ -+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); -+ -+#define SIX_LOCK_HELD_read_OFFSET 0 -+#define SIX_LOCK_HELD_read ~(~0U << 26) -+#define SIX_LOCK_HELD_intent (1U << 26) -+#define SIX_LOCK_HELD_write (1U << 27) -+#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) -+#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent)) -+#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) -+#define SIX_LOCK_NOSPIN (1U << 31) -+ -+struct six_lock_vals { -+ /* Value we add to the lock in order to take the lock: */ -+ u32 lock_val; -+ -+ /* If the lock has this value (used as a mask), taking the lock fails: 
*/ -+ u32 lock_fail; -+ -+ /* Mask that indicates lock is held for this type: */ -+ u32 held_mask; -+ -+ /* Waitlist we wakeup when releasing the lock: */ -+ enum six_lock_type unlock_wakeup; -+}; -+ -+static const struct six_lock_vals l[] = { -+ [SIX_LOCK_read] = { -+ .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, -+ .lock_fail = SIX_LOCK_HELD_write, -+ .held_mask = SIX_LOCK_HELD_read, -+ .unlock_wakeup = SIX_LOCK_write, -+ }, -+ [SIX_LOCK_intent] = { -+ .lock_val = SIX_LOCK_HELD_intent, -+ .lock_fail = SIX_LOCK_HELD_intent, -+ .held_mask = SIX_LOCK_HELD_intent, -+ .unlock_wakeup = SIX_LOCK_intent, -+ }, -+ [SIX_LOCK_write] = { -+ .lock_val = SIX_LOCK_HELD_write, -+ .lock_fail = SIX_LOCK_HELD_read, -+ .held_mask = SIX_LOCK_HELD_write, -+ .unlock_wakeup = SIX_LOCK_read, -+ }, -+}; -+ -+static inline void six_set_bitmask(struct six_lock *lock, u32 mask) -+{ -+ if ((atomic_read(&lock->state) & mask) != mask) -+ atomic_or(mask, &lock->state); -+} -+ -+static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) -+{ -+ if (atomic_read(&lock->state) & mask) -+ atomic_and(~mask, &lock->state); -+} -+ -+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, -+ u32 old, struct task_struct *owner) -+{ -+ if (type != SIX_LOCK_intent) -+ return; -+ -+ if (!(old & SIX_LOCK_HELD_intent)) { -+ EBUG_ON(lock->owner); -+ lock->owner = owner; -+ } else { -+ EBUG_ON(lock->owner != current); -+ } -+} -+ -+static inline unsigned pcpu_read_count(struct six_lock *lock) -+{ -+ unsigned read_count = 0; -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ read_count += *per_cpu_ptr(lock->readers, cpu); -+ return read_count; -+} -+ -+/* -+ * __do_six_trylock() - main trylock routine -+ * -+ * Returns 1 on success, 0 on failure -+ * -+ * In percpu reader mode, a failed trylock may cause a spurious trylock failure -+ * for anoter thread taking the competing lock type, and we may havve to do a -+ * wakeup: when a wakeup is required, we return -1 - wakeup_type. -+ */ -+static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, -+ struct task_struct *task, bool try) -+{ -+ int ret; -+ u32 old; -+ -+ EBUG_ON(type == SIX_LOCK_write && lock->owner != task); -+ EBUG_ON(type == SIX_LOCK_write && -+ (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); -+ -+ /* -+ * Percpu reader mode: -+ * -+ * The basic idea behind this algorithm is that you can implement a lock -+ * between two threads without any atomics, just memory barriers: -+ * -+ * For two threads you'll need two variables, one variable for "thread a -+ * has the lock" and another for "thread b has the lock". -+ * -+ * To take the lock, a thread sets its variable indicating that it holds -+ * the lock, then issues a full memory barrier, then reads from the -+ * other thread's variable to check if the other thread thinks it has -+ * the lock. If we raced, we backoff and retry/sleep. -+ * -+ * Failure to take the lock may cause a spurious trylock failure in -+ * another thread, because we temporarily set the lock to indicate that -+ * we held it. This would be a problem for a thread in six_lock(), when -+ * they are calling trylock after adding themself to the waitlist and -+ * prior to sleeping. -+ * -+ * Therefore, if we fail to get the lock, and there were waiters of the -+ * type we conflict with, we will have to issue a wakeup. -+ * -+ * Since we may be called under wait_lock (and by the wakeup code -+ * itself), we return that the wakeup has to be done instead of doing it -+ * here. 
-+ */ -+ if (type == SIX_LOCK_read && lock->readers) { -+ preempt_disable(); -+ this_cpu_inc(*lock->readers); /* signal that we own lock */ -+ -+ smp_mb(); -+ -+ old = atomic_read(&lock->state); -+ ret = !(old & l[type].lock_fail); -+ -+ this_cpu_sub(*lock->readers, !ret); -+ preempt_enable(); -+ -+ if (!ret && (old & SIX_LOCK_WAITING_write)) -+ ret = -1 - SIX_LOCK_write; -+ } else if (type == SIX_LOCK_write && lock->readers) { -+ if (try) { -+ atomic_add(SIX_LOCK_HELD_write, &lock->state); -+ smp_mb__after_atomic(); -+ } -+ -+ ret = !pcpu_read_count(lock); -+ -+ if (try && !ret) { -+ old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); -+ if (old & SIX_LOCK_WAITING_read) -+ ret = -1 - SIX_LOCK_read; -+ } -+ } else { -+ old = atomic_read(&lock->state); -+ do { -+ ret = !(old & l[type].lock_fail); -+ if (!ret || (type == SIX_LOCK_write && !try)) { -+ smp_mb(); -+ break; -+ } -+ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); -+ -+ EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); -+ } -+ -+ if (ret > 0) -+ six_set_owner(lock, type, old, task); -+ -+ EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && -+ (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); -+ -+ return ret; -+} -+ -+static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) -+{ -+ struct six_lock_waiter *w, *next; -+ struct task_struct *task; -+ bool saw_one; -+ int ret; -+again: -+ ret = 0; -+ saw_one = false; -+ raw_spin_lock(&lock->wait_lock); -+ -+ list_for_each_entry_safe(w, next, &lock->wait_list, list) { -+ if (w->lock_want != lock_type) -+ continue; -+ -+ if (saw_one && lock_type != SIX_LOCK_read) -+ goto unlock; -+ saw_one = true; -+ -+ ret = __do_six_trylock(lock, lock_type, w->task, false); -+ if (ret <= 0) -+ goto unlock; -+ -+ __list_del(w->list.prev, w->list.next); -+ task = w->task; -+ /* -+ * Do no writes to @w besides setting lock_acquired - otherwise -+ * we would need a memory barrier: -+ */ -+ barrier(); -+ w->lock_acquired = true; -+ wake_up_process(task); -+ } -+ -+ six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); -+unlock: -+ raw_spin_unlock(&lock->wait_lock); -+ -+ if (ret < 0) { -+ lock_type = -ret - 1; -+ goto again; -+ } -+} -+ -+__always_inline -+static void six_lock_wakeup(struct six_lock *lock, u32 state, -+ enum six_lock_type lock_type) -+{ -+ if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) -+ return; -+ -+ if (!(state & (SIX_LOCK_WAITING_read << lock_type))) -+ return; -+ -+ __six_lock_wakeup(lock, lock_type); -+} -+ -+__always_inline -+static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) -+{ -+ int ret; -+ -+ ret = __do_six_trylock(lock, type, current, try); -+ if (ret < 0) -+ __six_lock_wakeup(lock, -ret - 1); -+ -+ return ret > 0; -+} -+ -+/** -+ * six_trylock_ip - attempt to take a six lock without blocking -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * Return: true on success, false on failure. 
-+ */ -+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -+{ -+ if (!do_six_trylock(lock, type, true)) -+ return false; -+ -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); -+ return true; -+} -+EXPORT_SYMBOL_GPL(six_trylock_ip); -+ -+/** -+ * six_relock_ip - attempt to re-take a lock that was held previously -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @seq: lock sequence number obtained from six_lock_seq() while lock was -+ * held previously -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * Return: true on success, false on failure. -+ */ -+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq, unsigned long ip) -+{ -+ if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) -+ return false; -+ -+ if (six_lock_seq(lock) != seq) { -+ six_unlock_ip(lock, type, ip); -+ return false; -+ } -+ -+ return true; -+} -+EXPORT_SYMBOL_GPL(six_relock_ip); -+ -+#ifdef CONFIG_LOCK_SPIN_ON_OWNER -+ -+static inline bool six_can_spin_on_owner(struct six_lock *lock) -+{ -+ struct task_struct *owner; -+ bool ret; -+ -+ if (need_resched()) -+ return false; -+ -+ rcu_read_lock(); -+ owner = READ_ONCE(lock->owner); -+ ret = !owner || owner_on_cpu(owner); -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline bool six_spin_on_owner(struct six_lock *lock, -+ struct task_struct *owner, -+ u64 end_time) -+{ -+ bool ret = true; -+ unsigned loop = 0; -+ -+ rcu_read_lock(); -+ while (lock->owner == owner) { -+ /* -+ * Ensure we emit the owner->on_cpu, dereference _after_ -+ * checking lock->owner still matches owner. If that fails, -+ * owner might point to freed memory. If it still matches, -+ * the rcu_read_lock() ensures the memory stays valid. -+ */ -+ barrier(); -+ -+ if (!owner_on_cpu(owner) || need_resched()) { -+ ret = false; -+ break; -+ } -+ -+ if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { -+ six_set_bitmask(lock, SIX_LOCK_NOSPIN); -+ ret = false; -+ break; -+ } -+ -+ cpu_relax(); -+ } -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -+{ -+ struct task_struct *task = current; -+ u64 end_time; -+ -+ if (type == SIX_LOCK_write) -+ return false; -+ -+ preempt_disable(); -+ if (!six_can_spin_on_owner(lock)) -+ goto fail; -+ -+ if (!osq_lock(&lock->osq)) -+ goto fail; -+ -+ end_time = sched_clock() + 10 * NSEC_PER_USEC; -+ -+ while (1) { -+ struct task_struct *owner; -+ -+ /* -+ * If there's an owner, wait for it to either -+ * release the lock or go to sleep. -+ */ -+ owner = READ_ONCE(lock->owner); -+ if (owner && !six_spin_on_owner(lock, owner, end_time)) -+ break; -+ -+ if (do_six_trylock(lock, type, false)) { -+ osq_unlock(&lock->osq); -+ preempt_enable(); -+ return true; -+ } -+ -+ /* -+ * When there's no owner, we might have preempted between the -+ * owner acquiring the lock and setting the owner field. If -+ * we're an RT task that will live-lock because we won't let -+ * the owner complete. -+ */ -+ if (!owner && (need_resched() || rt_task(task))) -+ break; -+ -+ /* -+ * The cpu_relax() call is a compiler barrier which forces -+ * everything in this loop to be re-loaded. We don't need -+ * memory barriers as we'll eventually observe the right -+ * values at the cost of a few extra spins. 
-+ */ -+ cpu_relax(); -+ } -+ -+ osq_unlock(&lock->osq); -+fail: -+ preempt_enable(); -+ -+ /* -+ * If we fell out of the spin path because of need_resched(), -+ * reschedule now, before we try-lock again. This avoids getting -+ * scheduled out right after we obtained the lock. -+ */ -+ if (need_resched()) -+ schedule(); -+ -+ return false; -+} -+ -+#else /* CONFIG_LOCK_SPIN_ON_OWNER */ -+ -+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -+{ -+ return false; -+} -+ -+#endif -+ -+noinline -+static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, -+ struct six_lock_waiter *wait, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip) -+{ -+ int ret = 0; -+ -+ if (type == SIX_LOCK_write) { -+ EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); -+ atomic_add(SIX_LOCK_HELD_write, &lock->state); -+ smp_mb__after_atomic(); -+ } -+ -+ trace_contention_begin(lock, 0); -+ lock_contended(&lock->dep_map, ip); -+ -+ if (six_optimistic_spin(lock, type)) -+ goto out; -+ -+ wait->task = current; -+ wait->lock_want = type; -+ wait->lock_acquired = false; -+ -+ raw_spin_lock(&lock->wait_lock); -+ six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); -+ /* -+ * Retry taking the lock after taking waitlist lock, in case we raced -+ * with an unlock: -+ */ -+ ret = __do_six_trylock(lock, type, current, false); -+ if (ret <= 0) { -+ wait->start_time = local_clock(); -+ -+ if (!list_empty(&lock->wait_list)) { -+ struct six_lock_waiter *last = -+ list_last_entry(&lock->wait_list, -+ struct six_lock_waiter, list); -+ -+ if (time_before_eq64(wait->start_time, last->start_time)) -+ wait->start_time = last->start_time + 1; -+ } -+ -+ list_add_tail(&wait->list, &lock->wait_list); -+ } -+ raw_spin_unlock(&lock->wait_lock); -+ -+ if (unlikely(ret > 0)) { -+ ret = 0; -+ goto out; -+ } -+ -+ if (unlikely(ret < 0)) { -+ __six_lock_wakeup(lock, -ret - 1); -+ ret = 0; -+ } -+ -+ while (1) { -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ -+ if (wait->lock_acquired) -+ break; -+ -+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; -+ if (unlikely(ret)) { -+ raw_spin_lock(&lock->wait_lock); -+ if (!wait->lock_acquired) -+ list_del(&wait->list); -+ raw_spin_unlock(&lock->wait_lock); -+ -+ if (unlikely(wait->lock_acquired)) -+ do_six_unlock_type(lock, type); -+ break; -+ } -+ -+ schedule(); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+out: -+ if (ret && type == SIX_LOCK_write) { -+ six_clear_bitmask(lock, SIX_LOCK_HELD_write); -+ six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); -+ } -+ trace_contention_end(lock, 0); -+ -+ return ret; -+} -+ -+/** -+ * six_lock_ip_waiter - take a lock, with full waitlist interface -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @wait: pointer to wait object, which will be added to lock's waitlist -+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior -+ * to scheduling -+ * @p: passed through to @should_sleep_fn -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * This is the most general six_lock() variant, with parameters to support full -+ * cycle detection for deadlock avoidance. -+ * -+ * The code calling this function must implement tracking of held locks, and the -+ * @wait object should be embedded into the struct that tracks held locks - -+ * which must also be accessible in a thread-safe way. 
-+ * -+ * @should_sleep_fn should invoke the cycle detector; it should walk each -+ * lock's waiters, and for each waiter recursively walk their held locks. -+ * -+ * When this function must block, @wait will be added to @lock's waitlist before -+ * calling trylock, and before calling @should_sleep_fn, and @wait will not be -+ * removed from the lock waitlist until the lock has been successfully acquired, -+ * or we abort. -+ * -+ * @wait.start_time will be monotonically increasing for any given waitlist, and -+ * thus may be used as a loop cursor. -+ * -+ * Return: 0 on success, or the return code from @should_sleep_fn on failure. -+ */ -+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, -+ struct six_lock_waiter *wait, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip) -+{ -+ int ret; -+ -+ wait->start_time = 0; -+ -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); -+ -+ ret = do_six_trylock(lock, type, true) ? 0 -+ : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); -+ -+ if (ret && type != SIX_LOCK_write) -+ six_release(&lock->dep_map, ip); -+ if (!ret) -+ lock_acquired(&lock->dep_map, ip); -+ -+ return ret; -+} -+EXPORT_SYMBOL_GPL(six_lock_ip_waiter); -+ -+__always_inline -+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ u32 state; -+ -+ if (type == SIX_LOCK_intent) -+ lock->owner = NULL; -+ -+ if (type == SIX_LOCK_read && -+ lock->readers) { -+ smp_mb(); /* unlock barrier */ -+ this_cpu_dec(*lock->readers); -+ smp_mb(); /* between unlocking and checking for waiters */ -+ state = atomic_read(&lock->state); -+ } else { -+ u32 v = l[type].lock_val; -+ -+ if (type != SIX_LOCK_read) -+ v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; -+ -+ EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); -+ state = atomic_sub_return_release(v, &lock->state); -+ } -+ -+ six_lock_wakeup(lock, state, l[type].unlock_wakeup); -+} -+ -+/** -+ * six_unlock_ip - drop a six lock -+ * @lock: lock to unlock -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * When a lock is held multiple times (because six_lock_incement()) was used), -+ * this decrements the 'lock held' counter by one. 
-+ * -+ * For example: -+ * six_lock_read(&foo->lock); read count 1 -+ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 -+ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 -+ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 -+ */ -+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -+{ -+ EBUG_ON(type == SIX_LOCK_write && -+ !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); -+ EBUG_ON((type == SIX_LOCK_write || -+ type == SIX_LOCK_intent) && -+ lock->owner != current); -+ -+ if (type != SIX_LOCK_write) -+ six_release(&lock->dep_map, ip); -+ else -+ lock->seq++; -+ -+ if (type == SIX_LOCK_intent && -+ lock->intent_lock_recurse) { -+ --lock->intent_lock_recurse; -+ return; -+ } -+ -+ do_six_unlock_type(lock, type); -+} -+EXPORT_SYMBOL_GPL(six_unlock_ip); -+ -+/** -+ * six_lock_downgrade - convert an intent lock to a read lock -+ * @lock: lock to dowgrade -+ * -+ * @lock will have read count incremented and intent count decremented -+ */ -+void six_lock_downgrade(struct six_lock *lock) -+{ -+ six_lock_increment(lock, SIX_LOCK_read); -+ six_unlock_intent(lock); -+} -+EXPORT_SYMBOL_GPL(six_lock_downgrade); -+ -+/** -+ * six_lock_tryupgrade - attempt to convert read lock to an intent lock -+ * @lock: lock to upgrade -+ * -+ * On success, @lock will have intent count incremented and read count -+ * decremented -+ * -+ * Return: true on success, false on failure -+ */ -+bool six_lock_tryupgrade(struct six_lock *lock) -+{ -+ u32 old = atomic_read(&lock->state), new; -+ -+ do { -+ new = old; -+ -+ if (new & SIX_LOCK_HELD_intent) -+ return false; -+ -+ if (!lock->readers) { -+ EBUG_ON(!(new & SIX_LOCK_HELD_read)); -+ new -= l[SIX_LOCK_read].lock_val; -+ } -+ -+ new |= SIX_LOCK_HELD_intent; -+ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); -+ -+ if (lock->readers) -+ this_cpu_dec(*lock->readers); -+ -+ six_set_owner(lock, SIX_LOCK_intent, old, current); -+ -+ return true; -+} -+EXPORT_SYMBOL_GPL(six_lock_tryupgrade); -+ -+/** -+ * six_trylock_convert - attempt to convert a held lock from one type to another -+ * @lock: lock to upgrade -+ * @from: SIX_LOCK_read or SIX_LOCK_intent -+ * @to: SIX_LOCK_read or SIX_LOCK_intent -+ * -+ * On success, @lock will have intent count incremented and read count -+ * decremented -+ * -+ * Return: true on success, false on failure -+ */ -+bool six_trylock_convert(struct six_lock *lock, -+ enum six_lock_type from, -+ enum six_lock_type to) -+{ -+ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); -+ -+ if (to == from) -+ return true; -+ -+ if (to == SIX_LOCK_read) { -+ six_lock_downgrade(lock); -+ return true; -+ } else { -+ return six_lock_tryupgrade(lock); -+ } -+} -+EXPORT_SYMBOL_GPL(six_trylock_convert); -+ -+/** -+ * six_lock_increment - increase held lock count on a lock that is already held -+ * @lock: lock to increment -+ * @type: SIX_LOCK_read or SIX_LOCK_intent -+ * -+ * @lock must already be held, with a lock type that is greater than or equal to -+ * @type -+ * -+ * A corresponding six_unlock_type() call will be required for @lock to be fully -+ * unlocked. 
-+ */ -+void six_lock_increment(struct six_lock *lock, enum six_lock_type type) -+{ -+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); -+ -+ /* XXX: assert already locked, and that we don't overflow: */ -+ -+ switch (type) { -+ case SIX_LOCK_read: -+ if (lock->readers) { -+ this_cpu_inc(*lock->readers); -+ } else { -+ EBUG_ON(!(atomic_read(&lock->state) & -+ (SIX_LOCK_HELD_read| -+ SIX_LOCK_HELD_intent))); -+ atomic_add(l[type].lock_val, &lock->state); -+ } -+ break; -+ case SIX_LOCK_intent: -+ EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); -+ lock->intent_lock_recurse++; -+ break; -+ case SIX_LOCK_write: -+ BUG(); -+ break; -+ } -+} -+EXPORT_SYMBOL_GPL(six_lock_increment); -+ -+/** -+ * six_lock_wakeup_all - wake up all waiters on @lock -+ * @lock: lock to wake up waiters for -+ * -+ * Wakeing up waiters will cause them to re-run should_sleep_fn, which may then -+ * abort the lock operation. -+ * -+ * This function is never needed in a bug-free program; it's only useful in -+ * debug code, e.g. to determine if a cycle detector is at fault. -+ */ -+void six_lock_wakeup_all(struct six_lock *lock) -+{ -+ u32 state = atomic_read(&lock->state); -+ struct six_lock_waiter *w; -+ -+ six_lock_wakeup(lock, state, SIX_LOCK_read); -+ six_lock_wakeup(lock, state, SIX_LOCK_intent); -+ six_lock_wakeup(lock, state, SIX_LOCK_write); -+ -+ raw_spin_lock(&lock->wait_lock); -+ list_for_each_entry(w, &lock->wait_list, list) -+ wake_up_process(w->task); -+ raw_spin_unlock(&lock->wait_lock); -+} -+EXPORT_SYMBOL_GPL(six_lock_wakeup_all); -+ -+/** -+ * six_lock_counts - return held lock counts, for each lock type -+ * @lock: lock to return counters for -+ * -+ * Return: the number of times a lock is held for read, intent and write. -+ */ -+struct six_lock_count six_lock_counts(struct six_lock *lock) -+{ -+ struct six_lock_count ret; -+ -+ ret.n[SIX_LOCK_read] = !lock->readers -+ ? atomic_read(&lock->state) & SIX_LOCK_HELD_read -+ : pcpu_read_count(lock); -+ ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) + -+ lock->intent_lock_recurse; -+ ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write); -+ -+ return ret; -+} -+EXPORT_SYMBOL_GPL(six_lock_counts); -+ -+/** -+ * six_lock_readers_add - directly manipulate reader count of a lock -+ * @lock: lock to add/subtract readers for -+ * @nr: reader count to add/subtract -+ * -+ * When an upper layer is implementing lock reentrency, we may have both read -+ * and intent locks on the same lock. -+ * -+ * When we need to take a write lock, the read locks will cause self-deadlock, -+ * because six locks themselves do not track which read locks are held by the -+ * current thread and which are held by a different thread - it does no -+ * per-thread tracking of held locks. -+ * -+ * The upper layer that is tracking held locks may however, if trylock() has -+ * failed, count up its own read locks, subtract them, take the write lock, and -+ * then re-add them. -+ * -+ * As in any other situation when taking a write lock, @lock must be held for -+ * intent one (or more) times, so @lock will never be left unlocked. 
-+ */ -+void six_lock_readers_add(struct six_lock *lock, int nr) -+{ -+ if (lock->readers) { -+ this_cpu_add(*lock->readers, nr); -+ } else { -+ EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0); -+ /* reader count starts at bit 0 */ -+ atomic_add(nr, &lock->state); -+ } -+} -+EXPORT_SYMBOL_GPL(six_lock_readers_add); -+ -+/** -+ * six_lock_exit - release resources held by a lock prior to freeing -+ * @lock: lock to exit -+ * -+ * When a lock was initialized in percpu mode (SIX_OLCK_INIT_PCPU), this is -+ * required to free the percpu read counts. -+ */ -+void six_lock_exit(struct six_lock *lock) -+{ -+ WARN_ON(lock->readers && pcpu_read_count(lock)); -+ WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read); -+ -+ free_percpu(lock->readers); -+ lock->readers = NULL; -+} -+EXPORT_SYMBOL_GPL(six_lock_exit); -+ -+void __six_lock_init(struct six_lock *lock, const char *name, -+ struct lock_class_key *key, enum six_lock_init_flags flags) -+{ -+ atomic_set(&lock->state, 0); -+ raw_spin_lock_init(&lock->wait_lock); -+ INIT_LIST_HEAD(&lock->wait_list); -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); -+ lockdep_init_map(&lock->dep_map, name, key, 0); -+#endif -+ -+ /* -+ * Don't assume that we have real percpu variables available in -+ * userspace: -+ */ -+#ifdef __KERNEL__ -+ if (flags & SIX_LOCK_INIT_PCPU) { -+ /* -+ * We don't return an error here on memory allocation failure -+ * since percpu is an optimization, and locks will work with the -+ * same semantics in non-percpu mode: callers can check for -+ * failure if they wish by checking lock->readers, but generally -+ * will not want to treat it as an error. -+ */ -+ lock->readers = alloc_percpu(unsigned); -+ } -+#endif -+} -+EXPORT_SYMBOL_GPL(__six_lock_init); -diff --git a/kernel/module/main.c b/kernel/module/main.c -index 4e2cf784c..7f7b5bedf 100644 ---- a/kernel/module/main.c -+++ b/kernel/module/main.c -@@ -56,6 +56,7 @@ - #include - #include - #include -+#include - #include - #include - #include "internal.h" -@@ -1217,15 +1218,19 @@ static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) - return module_alloc(size); - } - --static void module_memory_free(void *ptr, enum mod_mem_type type) -+static void module_memory_free(void *ptr, enum mod_mem_type type, -+ bool unload_codetags) - { -+ if (!unload_codetags && mod_mem_type_is_core_data(type)) -+ return; -+ - if (mod_mem_use_vmalloc(type)) - vfree(ptr); - else - module_memfree(ptr); - } - --static void free_mod_mem(struct module *mod) -+static void free_mod_mem(struct module *mod, bool unload_codetags) - { - for_each_mod_mem_type(type) { - struct module_memory *mod_mem = &mod->mem[type]; -@@ -1236,19 +1241,23 @@ static void free_mod_mem(struct module *mod) - /* Free lock-classes; relies on the preceding sync_rcu(). */ - lockdep_free_key_range(mod_mem->base, mod_mem->size); - if (mod_mem->size) -- module_memory_free(mod_mem->base, type); -+ module_memory_free(mod_mem->base, type, -+ unload_codetags); - } - - /* MOD_DATA hosts mod, so free it at last */ - lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); -- module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA); -+ module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA, unload_codetags); - } - - /* Free a module, remove from lists, etc. 
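
[Illustrative note on the six.c hunk dropped above: the file exposes a three-state shared/intent/exclusive lock. The sketch below is built only from the entry points visible in this hunk (__six_lock_init, six_trylock_ip, six_lock_tryupgrade, six_unlock_ip, six_lock_exit) and is not the upstream bcachefs calling convention; the six_lock_read()/six_unlock_read() convenience wrappers live in the six.h header, which is not part of this hunk. The demo_* names, the <linux/six.h> include and the flags value 0 are assumptions for illustration only.]

	/* Minimal sketch, assuming <linux/six.h> declares the exported functions above. */
	static struct six_lock demo_lock;
	static struct lock_class_key demo_key;

	static void demo_six_lock(void)
	{
		__six_lock_init(&demo_lock, "demo_lock", &demo_key, 0);

		/* Shared access: SIX_LOCK_read may be held by many threads at once. */
		if (!six_trylock_ip(&demo_lock, SIX_LOCK_read, _THIS_IP_))
			return;

		/* ... inspect the protected structure ... */

		/* Upgrade read -> intent; intent only excludes other intent holders. */
		if (six_lock_tryupgrade(&demo_lock)) {
			/* Write requires intent to be held and additionally excludes readers. */
			if (six_trylock_ip(&demo_lock, SIX_LOCK_write, _THIS_IP_)) {
				/* ... modify the protected structure ... */
				six_unlock_ip(&demo_lock, SIX_LOCK_write, _THIS_IP_);
			}
			six_unlock_ip(&demo_lock, SIX_LOCK_intent, _THIS_IP_);
		} else {
			six_unlock_ip(&demo_lock, SIX_LOCK_read, _THIS_IP_);
		}

		six_lock_exit(&demo_lock);
	}
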
*/ - static void free_module(struct module *mod) - { -+ bool unload_codetags; -+ - trace_module_free(mod); - -+ unload_codetags = codetag_unload_module(mod); - mod_sysfs_teardown(mod); - - /* -@@ -1290,7 +1299,7 @@ static void free_module(struct module *mod) - kfree(mod->args); - percpu_modfree(mod); - -- free_mod_mem(mod); -+ free_mod_mem(mod, unload_codetags); - } - - void *__symbol_get(const char *symbol) -@@ -2292,7 +2301,7 @@ static int move_module(struct module *mod, struct load_info *info) - return 0; - out_enomem: - for (t--; t >= 0; t--) -- module_memory_free(mod->mem[t].base, t); -+ module_memory_free(mod->mem[t].base, t, true); - return ret; - } - -@@ -2422,7 +2431,7 @@ static void module_deallocate(struct module *mod, struct load_info *info) - percpu_modfree(mod); - module_arch_freeing_init(mod); - -- free_mod_mem(mod); -+ free_mod_mem(mod, true); - } - - int __weak module_finalize(const Elf_Ehdr *hdr, -@@ -2974,6 +2983,8 @@ static int load_module(struct load_info *info, const char __user *uargs, - /* Get rid of temporary copy. */ - free_copy(info, flags); - -+ codetag_load_module(mod); -+ - /* Done! */ - trace_module_load(mod); - diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9ed5ce989..4f6582487 100644 --- a/kernel/stacktrace.c @@ -98479,47 +97496,10 @@ index 5c2da561c..f78bc8b42 100644 bool depends on !NO_IOMEM diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index ce51d4dc6..a19ec6fd7 100644 +index d6798513a..69a3e33d1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug -@@ -957,6 +957,36 @@ config DEBUG_STACKOVERFLOW - - If in doubt, say "N". - -+config CODE_TAGGING -+ bool -+ select KALLSYMS -+ -+config MEM_ALLOC_PROFILING -+ bool "Enable memory allocation profiling" -+ default n -+ depends on PROC_FS -+ select CODE_TAGGING -+ select PAGE_EXTENSION -+ select SLAB_OBJ_EXT -+ help -+ Track allocation source code and record total allocation size -+ initiated at that code location. The mechanism can be used to track -+ memory leaks with a low performance and memory impact. -+ -+config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT -+ bool "Enable memory allocation profiling by default" -+ default y -+ depends on MEM_ALLOC_PROFILING -+ -+config MEM_ALLOC_PROFILING_DEBUG -+ bool "Memory allocation profiler debugging" -+ default n -+ depends on MEM_ALLOC_PROFILING -+ select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT -+ help -+ Adds warnings with helpful error messages for memory allocation -+ profiling. -+ - source "lib/Kconfig.kasan" - source "lib/Kconfig.kfence" - source "lib/Kconfig.kmsan" -@@ -1637,6 +1667,15 @@ config DEBUG_NOTIFIERS +@@ -1710,6 +1710,15 @@ config DEBUG_NOTIFIERS This is a relatively cheap check but if you care about maximum performance, say N. @@ -98535,20 +97515,7 @@ index ce51d4dc6..a19ec6fd7 100644 config BUG_ON_DATA_CORRUPTION bool "Trigger a BUG when data corruption is detected" select DEBUG_LIST -@@ -1997,6 +2036,12 @@ config FAULT_INJECTION_STACKTRACE_FILTER - help - Provide stacktrace filter for fault-injection capabilities - -+config CODETAG_FAULT_INJECTION -+ bool "Code tagging based fault injection" -+ select CODE_TAGGING -+ help -+ Dynamic fault injection based on code tagging -+ - config ARCH_HAS_KCOV - bool - help -@@ -2123,6 +2168,15 @@ config CPUMASK_KUNIT_TEST +@@ -2196,6 +2205,15 @@ config CPUMASK_KUNIT_TEST If unsure, say N. 
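
[For reference: the instrumentation removed from the patch above was gated behind the Kconfig symbols deleted in this lib/Kconfig.debug hunk, while CONFIG_DEBUG_CLOSURES is kept. Against a tree still carrying those hunks, a .config fragment enabling the whole set would have looked like the lines below; CODE_TAGGING, PAGE_EXTENSION and SLAB_OBJ_EXT are selected automatically by MEM_ALLOC_PROFILING, and MEM_ALLOC_PROFILING_DEBUG already selects MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.]

	CONFIG_MEM_ALLOC_PROFILING=y
	CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y
	CONFIG_MEM_ALLOC_PROFILING_DEBUG=y
	CONFIG_CODETAG_FAULT_INJECTION=y
	CONFIG_DEBUG_CLOSURES=y
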
@@ -98565,31 +97532,10 @@ index ce51d4dc6..a19ec6fd7 100644 tristate "Linked list sorting test" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile -index 876fcdeae..fb1d20939 100644 +index 1ffae65bb..5ac5d72ba 100644 --- a/lib/Makefile +++ b/lib/Makefile -@@ -30,7 +30,7 @@ endif - lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o timerqueue.o xarray.o \ - maple_tree.o idr.o extable.o irq_regs.o argv_split.o \ -- flex_proportions.o ratelimit.o show_mem.o \ -+ flex_proportions.o ratelimit.o \ - is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ - nmi_backtrace.o win_minmax.o memcat_p.o \ -@@ -226,6 +226,11 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ - of-reconfig-notifier-error-inject.o - obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o - -+obj-$(CONFIG_CODE_TAGGING) += codetag.o -+obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o -+ -+obj-$(CONFIG_CODETAG_FAULT_INJECTION) += dynamic_fault.o -+ - lib-$(CONFIG_GENERIC_BUG) += bug.o - - obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o -@@ -248,6 +253,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o +@@ -254,6 +254,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o @@ -98598,245 +97544,14 @@ index 876fcdeae..fb1d20939 100644 obj-$(CONFIG_DQL) += dynamic_queue_limits.o obj-$(CONFIG_GLOB) += glob.o -diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c -new file mode 100644 -index 000000000..1ca90cff5 ---- /dev/null -+++ b/lib/alloc_tag.c -@@ -0,0 +1,225 @@ -+// SPDX-License-Identifier: GPL-2.0-only -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static struct codetag_type *alloc_tag_cttype; -+ -+DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, -+ mem_alloc_profiling_key); -+ -+static void *allocinfo_start(struct seq_file *m, loff_t *pos) -+{ -+ struct codetag_iterator *iter; -+ struct codetag *ct; -+ loff_t node = *pos; -+ -+ iter = kzalloc(sizeof(*iter), GFP_KERNEL); -+ m->private = iter; -+ if (!iter) -+ return NULL; -+ -+ codetag_lock_module_list(alloc_tag_cttype, true); -+ *iter = codetag_get_ct_iter(alloc_tag_cttype); -+ while ((ct = codetag_next_ct(iter)) != NULL && node) -+ node--; -+ -+ return ct ? 
iter : NULL; -+} -+ -+static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos) -+{ -+ struct codetag_iterator *iter = (struct codetag_iterator *)arg; -+ struct codetag *ct = codetag_next_ct(iter); -+ -+ (*pos)++; -+ if (!ct) -+ return NULL; -+ -+ return iter; -+} -+ -+static void allocinfo_stop(struct seq_file *m, void *arg) -+{ -+ struct codetag_iterator *iter = (struct codetag_iterator *)m->private; -+ -+ if (iter) { -+ codetag_lock_module_list(alloc_tag_cttype, false); -+ kfree(iter); -+ } -+} -+ -+static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct) -+{ -+ struct alloc_tag *tag = ct_to_alloc_tag(ct); -+ s64 bytes = alloc_tag_read(tag); -+ char val[10], *p = val; -+ -+ if (bytes < 0) { -+ *p++ = '-'; -+ bytes = -bytes; -+ } -+ -+ string_get_size(bytes, 1, -+ STRING_SIZE_BASE2|STRING_SIZE_NOSPACE, -+ p, val + ARRAY_SIZE(val) - p); -+ -+ seq_buf_printf(out, "%8s ", val); -+ codetag_to_text(out, ct); -+ seq_buf_putc(out, ' '); -+ seq_buf_putc(out, '\n'); -+} -+ -+static int allocinfo_show(struct seq_file *m, void *arg) -+{ -+ struct codetag_iterator *iter = (struct codetag_iterator *)arg; -+ char *bufp; -+ size_t n = seq_get_buf(m, &bufp); -+ struct seq_buf buf; -+ -+ seq_buf_init(&buf, bufp, n); -+ alloc_tag_to_text(&buf, iter->ct); -+ seq_commit(m, seq_buf_used(&buf)); -+ return 0; -+} -+ -+static const struct seq_operations allocinfo_seq_op = { -+ .start = allocinfo_start, -+ .next = allocinfo_next, -+ .stop = allocinfo_stop, -+ .show = allocinfo_show, -+}; -+ -+void alloc_tags_show_mem_report(struct seq_buf *s) -+{ -+ struct codetag_iterator iter; -+ struct codetag *ct; -+ struct { -+ struct codetag *tag; -+ size_t bytes; -+ } tags[10], n; -+ unsigned int i, nr = 0; -+ -+ codetag_lock_module_list(alloc_tag_cttype, true); -+ iter = codetag_get_ct_iter(alloc_tag_cttype); -+ while ((ct = codetag_next_ct(&iter))) { -+ n.tag = ct; -+ n.bytes = alloc_tag_read(ct_to_alloc_tag(ct)); -+ -+ for (i = 0; i < nr; i++) -+ if (n.bytes > tags[i].bytes) -+ break; -+ -+ if (i < ARRAY_SIZE(tags)) { -+ nr -= nr == ARRAY_SIZE(tags); -+ memmove(&tags[i + 1], -+ &tags[i], -+ sizeof(tags[0]) * (nr - i)); -+ nr++; -+ tags[i] = n; -+ } -+ } -+ -+ for (i = 0; i < nr; i++) -+ alloc_tag_to_text(s, tags[i].tag); -+ -+ codetag_lock_module_list(alloc_tag_cttype, false); -+} -+ -+static void __init procfs_init(void) -+{ -+ proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op); -+} -+ -+static void alloc_tag_module_load(struct codetag_type *cttype, struct codetag_module *cmod) -+{ -+ struct codetag_iterator iter = codetag_get_ct_iter(cttype); -+ struct codetag *ct; -+ -+ for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { -+ if (iter.cmod != cmod) -+ continue; -+ -+ ct_to_alloc_tag(ct)->bytes_allocated = alloc_percpu(u64); -+ } -+} -+ -+static bool alloc_tag_module_unload(struct codetag_type *cttype, struct codetag_module *cmod) -+{ -+ struct codetag_iterator iter = codetag_get_ct_iter(cttype); -+ bool module_unused = true; -+ struct alloc_tag *tag; -+ struct codetag *ct; -+ size_t bytes; -+ -+ for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { -+ if (iter.cmod != cmod) -+ continue; -+ -+ tag = ct_to_alloc_tag(ct); -+ bytes = alloc_tag_read(tag); -+ -+ if (!WARN(bytes, "%s:%u module %s func:%s has %zu allocated at module unload", -+ ct->filename, ct->lineno, ct->modname, ct->function, bytes)) -+ free_percpu(tag->bytes_allocated); -+ else -+ module_unused = false; -+ } -+ -+ return module_unused; -+} -+ -+static __init bool 
need_page_alloc_tagging(void) -+{ -+ return true; -+} -+ -+static __init void init_page_alloc_tagging(void) -+{ -+} -+ -+struct page_ext_operations page_alloc_tagging_ops = { -+ .size = sizeof(union codetag_ref), -+ .need = need_page_alloc_tagging, -+ .init = init_page_alloc_tagging, -+}; -+EXPORT_SYMBOL(page_alloc_tagging_ops); -+ -+static struct ctl_table memory_allocation_profiling_sysctls[] = { -+ { -+ .procname = "mem_profiling", -+ .data = &mem_alloc_profiling_key, -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+ .mode = 0444, -+#else -+ .mode = 0644, -+#endif -+ .proc_handler = proc_do_static_key, -+ }, -+ { } -+}; -+ -+static int __init alloc_tag_init(void) -+{ -+ const struct codetag_type_desc desc = { -+ .section = "alloc_tags", -+ .tag_size = sizeof(struct alloc_tag), -+ .module_load = alloc_tag_module_load, -+ .module_unload = alloc_tag_module_unload, -+ }; -+ -+ alloc_tag_cttype = codetag_register_type(&desc); -+ if (IS_ERR_OR_NULL(alloc_tag_cttype)) -+ return PTR_ERR(alloc_tag_cttype); -+ -+ register_sysctl_init("vm", memory_allocation_profiling_sysctls); -+ procfs_init(); -+ -+ return 0; -+} -+module_init(alloc_tag_init); diff --git a/drivers/md/bcache/closure.c b/lib/closure.c -similarity index 88% +similarity index 85% rename from drivers/md/bcache/closure.c rename to lib/closure.c -index d8d9394a6..0855e698c 100644 +index d8d9394a6..2958169ce 100644 --- a/drivers/md/bcache/closure.c +++ b/lib/closure.c -@@ -6,13 +6,13 @@ +@@ -6,19 +6,20 @@ * Copyright 2012 Google, Inc. */ @@ -98853,7 +97568,16 @@ index d8d9394a6..0855e698c 100644 static inline void closure_put_after_sub(struct closure *cl, int flags) { int r = flags & CLOSURE_REMAINING_MASK; -@@ -45,6 +45,7 @@ void closure_sub(struct closure *cl, int v) + +- BUG_ON(flags & CLOSURE_GUARD_MASK); +- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); ++ if ((flags & CLOSURE_GUARD_MASK) || ++ (!r && (flags & ~CLOSURE_DESTRUCTOR))) ++ panic("closure_put_after_sub: bogus flags %x remaining %i", flags, r); + + if (!r) { + if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { +@@ -45,6 +46,7 @@ void closure_sub(struct closure *cl, int v) { closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); } @@ -98861,7 +97585,7 @@ index d8d9394a6..0855e698c 100644 /* * closure_put - decrement a closure's refcount -@@ -53,6 +54,7 @@ void closure_put(struct closure *cl) +@@ -53,6 +55,7 @@ void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); } @@ -98869,7 +97593,7 @@ index d8d9394a6..0855e698c 100644 /* * closure_wake_up - wake up all closures on a wait list, without memory barrier -@@ -74,6 +76,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) +@@ -74,6 +77,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) closure_sub(cl, CLOSURE_WAITING + 1); } } @@ -98877,7 +97601,7 @@ index d8d9394a6..0855e698c 100644 /** * closure_wait - add a closure to a waitlist -@@ -93,6 +96,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +@@ -93,6 +97,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) return true; } @@ -98885,7 +97609,7 @@ index d8d9394a6..0855e698c 100644 struct closure_syncer { struct task_struct *task; -@@ -127,8 +131,9 @@ void __sched __closure_sync(struct closure *cl) +@@ -127,8 +132,9 @@ void __sched __closure_sync(struct closure *cl) __set_current_state(TASK_RUNNING); } @@ -98896,7 +97620,7 @@ index d8d9394a6..0855e698c 100644 static LIST_HEAD(closure_list); static DEFINE_SPINLOCK(closure_list_lock); -@@ -144,6 +149,7 @@ 
void closure_debug_create(struct closure *cl) +@@ -144,6 +150,7 @@ void closure_debug_create(struct closure *cl) list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } @@ -98904,7 +97628,7 @@ index d8d9394a6..0855e698c 100644 void closure_debug_destroy(struct closure *cl) { -@@ -156,8 +162,7 @@ void closure_debug_destroy(struct closure *cl) +@@ -156,8 +163,7 @@ void closure_debug_destroy(struct closure *cl) list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } @@ -98914,7 +97638,7 @@ index d8d9394a6..0855e698c 100644 static int debug_show(struct seq_file *f, void *data) { -@@ -181,7 +186,7 @@ static int debug_show(struct seq_file *f, void *data) +@@ -181,7 +187,7 @@ static int debug_show(struct seq_file *f, void *data) seq_printf(f, " W %pS\n", (void *) cl->waiting_on); @@ -98923,7 +97647,7 @@ index d8d9394a6..0855e698c 100644 } spin_unlock_irq(&closure_list_lock); -@@ -190,18 +195,11 @@ static int debug_show(struct seq_file *f, void *data) +@@ -190,18 +196,11 @@ static int debug_show(struct seq_file *f, void *data) DEFINE_SHOW_ATTRIBUTE(debug); @@ -98947,782 +97671,6 @@ index d8d9394a6..0855e698c 100644 -MODULE_AUTHOR("Kent Overstreet "); -MODULE_LICENSE("GPL"); +#endif -diff --git a/lib/codetag.c b/lib/codetag.c -new file mode 100644 -index 000000000..84f90f3b9 ---- /dev/null -+++ b/lib/codetag.c -@@ -0,0 +1,393 @@ -+// SPDX-License-Identifier: GPL-2.0-only -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+struct codetag_type { -+ struct list_head link; -+ unsigned int count; -+ struct idr mod_idr; -+ struct rw_semaphore mod_lock; /* protects mod_idr */ -+ struct codetag_type_desc desc; -+}; -+ -+static DEFINE_MUTEX(codetag_lock); -+static LIST_HEAD(codetag_types); -+ -+void codetag_lock_module_list(struct codetag_type *cttype, bool lock) -+{ -+ if (lock) -+ down_read(&cttype->mod_lock); -+ else -+ up_read(&cttype->mod_lock); -+} -+ -+struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype) -+{ -+ struct codetag_iterator iter = { -+ .cttype = cttype, -+ .cmod = NULL, -+ .mod_id = 0, -+ .ct = NULL, -+ }; -+ -+ return iter; -+} -+ -+static inline struct codetag *get_first_module_ct(struct codetag_module *cmod) -+{ -+ return cmod->range.start < cmod->range.stop ? cmod->range.start : NULL; -+} -+ -+static inline -+struct codetag *get_next_module_ct(struct codetag_iterator *iter) -+{ -+ struct codetag *res = (struct codetag *) -+ ((char *)iter->ct + iter->cttype->desc.tag_size); -+ -+ return res < iter->cmod->range.stop ? 
res : NULL; -+} -+ -+struct codetag *codetag_next_ct(struct codetag_iterator *iter) -+{ -+ struct codetag_type *cttype = iter->cttype; -+ struct codetag_module *cmod; -+ struct codetag *ct; -+ -+ lockdep_assert_held(&cttype->mod_lock); -+ -+ if (unlikely(idr_is_empty(&cttype->mod_idr))) -+ return NULL; -+ -+ ct = NULL; -+ while (true) { -+ cmod = idr_find(&cttype->mod_idr, iter->mod_id); -+ -+ /* If module was removed move to the next one */ -+ if (!cmod) -+ cmod = idr_get_next_ul(&cttype->mod_idr, -+ &iter->mod_id); -+ -+ /* Exit if no more modules */ -+ if (!cmod) -+ break; -+ -+ if (cmod != iter->cmod) { -+ iter->cmod = cmod; -+ ct = get_first_module_ct(cmod); -+ } else -+ ct = get_next_module_ct(iter); -+ -+ if (ct) -+ break; -+ -+ iter->mod_id++; -+ } -+ -+ iter->ct = ct; -+ return ct; -+} -+ -+void codetag_to_text(struct seq_buf *out, struct codetag *ct) -+{ -+ seq_buf_printf(out, "%s:%u module:%s func:%s", -+ ct->filename, ct->lineno, -+ ct->modname, ct->function); -+} -+ -+static inline size_t range_size(const struct codetag_type *cttype, -+ const struct codetag_range *range) -+{ -+ return ((char *)range->stop - (char *)range->start) / -+ cttype->desc.tag_size; -+} -+ -+static void *get_symbol(struct module *mod, const char *prefix, const char *name) -+{ -+ char buf[64]; -+ void *ret; -+ int res; -+ -+ res = snprintf(buf, sizeof(buf), "%s%s", prefix, name); -+ if (WARN_ON(res < 1 || res > sizeof(buf))) -+ return NULL; -+ -+ preempt_disable(); -+ ret = mod ? -+ (void *)find_kallsyms_symbol_value(mod, buf) : -+ (void *)kallsyms_lookup_name(buf); -+ preempt_enable(); -+ -+ return ret; -+} -+ -+static struct codetag_range get_section_range(struct module *mod, -+ const char *section) -+{ -+ return (struct codetag_range) { -+ get_symbol(mod, "__start_", section), -+ get_symbol(mod, "__stop_", section), -+ }; -+} -+ -+static int codetag_module_init(struct codetag_type *cttype, struct module *mod) -+{ -+ struct codetag_range range; -+ struct codetag_module *cmod; -+ int err; -+ -+ range = get_section_range(mod, cttype->desc.section); -+ if (!range.start || !range.stop) { -+ pr_warn("Failed to load code tags of type %s from the module %s\n", -+ cttype->desc.section, -+ mod ? 
mod->name : "(built-in)"); -+ return -EINVAL; -+ } -+ -+ /* Ignore empty ranges */ -+ if (range.start == range.stop) -+ return 0; -+ -+ BUG_ON(range.start > range.stop); -+ -+ cmod = kmalloc(sizeof(*cmod), GFP_KERNEL); -+ if (unlikely(!cmod)) -+ return -ENOMEM; -+ -+ cmod->mod = mod; -+ cmod->range = range; -+ -+ down_write(&cttype->mod_lock); -+ err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL); -+ if (err >= 0) { -+ cttype->count += range_size(cttype, &range); -+ if (cttype->desc.module_load) -+ cttype->desc.module_load(cttype, cmod); -+ } -+ up_write(&cttype->mod_lock); -+ -+ if (err < 0) { -+ kfree(cmod); -+ return err; -+ } -+ -+ return 0; -+} -+ -+struct codetag_type * -+codetag_register_type(const struct codetag_type_desc *desc) -+{ -+ struct codetag_type *cttype; -+ int err; -+ -+ BUG_ON(desc->tag_size <= 0); -+ -+ cttype = kzalloc(sizeof(*cttype), GFP_KERNEL); -+ if (unlikely(!cttype)) -+ return ERR_PTR(-ENOMEM); -+ -+ cttype->desc = *desc; -+ idr_init(&cttype->mod_idr); -+ init_rwsem(&cttype->mod_lock); -+ -+ err = codetag_module_init(cttype, NULL); -+ if (unlikely(err)) { -+ kfree(cttype); -+ return ERR_PTR(err); -+ } -+ -+ mutex_lock(&codetag_lock); -+ list_add_tail(&cttype->link, &codetag_types); -+ mutex_unlock(&codetag_lock); -+ -+ return cttype; -+} -+ -+void codetag_load_module(struct module *mod) -+{ -+ struct codetag_type *cttype; -+ -+ if (!mod) -+ return; -+ -+ mutex_lock(&codetag_lock); -+ list_for_each_entry(cttype, &codetag_types, link) -+ codetag_module_init(cttype, mod); -+ mutex_unlock(&codetag_lock); -+} -+ -+bool codetag_unload_module(struct module *mod) -+{ -+ struct codetag_type *cttype; -+ bool unload_ok = true; -+ -+ if (!mod) -+ return true; -+ -+ mutex_lock(&codetag_lock); -+ list_for_each_entry(cttype, &codetag_types, link) { -+ struct codetag_module *found = NULL; -+ struct codetag_module *cmod; -+ unsigned long mod_id, tmp; -+ -+ down_write(&cttype->mod_lock); -+ idr_for_each_entry_ul(&cttype->mod_idr, cmod, tmp, mod_id) { -+ if (cmod->mod && cmod->mod == mod) { -+ found = cmod; -+ break; -+ } -+ } -+ if (found) { -+ if (cttype->desc.module_unload) -+ if (!cttype->desc.module_unload(cttype, cmod)) -+ unload_ok = false; -+ -+ cttype->count -= range_size(cttype, &cmod->range); -+ idr_remove(&cttype->mod_idr, mod_id); -+ kfree(cmod); -+ } -+ up_write(&cttype->mod_lock); -+ } -+ mutex_unlock(&codetag_lock); -+ -+ return unload_ok; -+} -+ -+/* Codetag query parsing */ -+ -+#define CODETAG_QUERY_TOKENS() \ -+ x(func) \ -+ x(file) \ -+ x(line) \ -+ x(module) \ -+ x(class) \ -+ x(index) -+ -+enum tokens { -+#define x(name) TOK_##name, -+ CODETAG_QUERY_TOKENS() -+#undef x -+}; -+ -+static const char * const token_strs[] = { -+#define x(name) #name, -+ CODETAG_QUERY_TOKENS() -+#undef x -+ NULL -+}; -+ -+static int parse_range(char *str, unsigned int *first, unsigned int *last) -+{ -+ char *first_str = str; -+ char *last_str = strchr(first_str, '-'); -+ -+ if (last_str) -+ *last_str++ = '\0'; -+ -+ if (kstrtouint(first_str, 10, first)) -+ return -EINVAL; -+ -+ if (!last_str) -+ *last = *first; -+ else if (kstrtouint(last_str, 10, last)) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+char *codetag_query_parse(struct codetag_query *q, char *buf) -+{ -+ while (1) { -+ char *p = buf; -+ char *str1 = strsep_no_empty(&p, " \t\r\n"); -+ char *str2 = strsep_no_empty(&p, " \t\r\n"); -+ int ret, token; -+ -+ if (!str1 || !str2) -+ break; -+ -+ token = match_string(token_strs, ARRAY_SIZE(token_strs), str1); -+ if (token < 0) -+ break; -+ -+ switch (token) { -+ case 
TOK_func: -+ q->function = str2; -+ break; -+ case TOK_file: -+ q->filename = str2; -+ break; -+ case TOK_line: -+ ret = parse_range(str2, &q->first_line, &q->last_line); -+ if (ret) -+ return ERR_PTR(ret); -+ q->match_line = true; -+ break; -+ case TOK_module: -+ q->module = str2; -+ break; -+ case TOK_class: -+ q->class = str2; -+ break; -+ case TOK_index: -+ ret = parse_range(str2, &q->first_index, &q->last_index); -+ if (ret) -+ return ERR_PTR(ret); -+ q->match_index = true; -+ break; -+ } -+ -+ buf = p; -+ } -+ -+ return buf; -+} -+ -+bool codetag_matches_query(struct codetag_query *q, -+ const struct codetag *ct, -+ const struct codetag_module *mod, -+ const char *class) -+{ -+ size_t classlen = q->class ? strlen(q->class) : 0; -+ -+ if (q->module && -+ (!mod->mod || -+ strcmp(q->module, ct->modname))) -+ return false; -+ -+ if (q->filename && -+ strcmp(q->filename, ct->filename) && -+ strcmp(q->filename, kbasename(ct->filename))) -+ return false; -+ -+ if (q->function && -+ strcmp(q->function, ct->function)) -+ return false; -+ -+ /* match against the line number range */ -+ if (q->match_line && -+ (ct->lineno < q->first_line || -+ ct->lineno > q->last_line)) -+ return false; -+ -+ /* match against the class */ -+ if (classlen && -+ (strncmp(q->class, class, classlen) || -+ (class[classlen] && class[classlen] != ':'))) -+ return false; -+ -+ /* match against the fault index */ -+ if (q->match_index && -+ (q->cur_index < q->first_index || -+ q->cur_index > q->last_index)) { -+ q->cur_index++; -+ return false; -+ } -+ -+ q->cur_index++; -+ return true; -+} -diff --git a/lib/dynamic_fault.c b/lib/dynamic_fault.c -new file mode 100644 -index 000000000..c92374359 ---- /dev/null -+++ b/lib/dynamic_fault.c -@@ -0,0 +1,371 @@ -+// SPDX-License-Identifier: GPL-2.0-only -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static struct codetag_type *cttype; -+ -+bool __dynamic_fault_enabled(struct dfault *df) -+{ -+ union dfault_state old, new; -+ unsigned int v = df->state.v; -+ bool ret; -+ -+ do { -+ old.v = new.v = v; -+ -+ if (new.enabled == DFAULT_disabled) -+ return false; -+ -+ ret = df->frequency -+ ? ++new.count >= df->frequency -+ : true; -+ if (ret) -+ new.count = 0; -+ if (ret && new.enabled == DFAULT_oneshot) -+ new.enabled = DFAULT_disabled; -+ } while ((v = cmpxchg(&df->state.v, old.v, new.v)) != old.v); -+ -+ if (ret) -+ pr_debug("returned true for %s:%u", df->tag.filename, df->tag.lineno); -+ -+ return ret; -+} -+EXPORT_SYMBOL(__dynamic_fault_enabled); -+ -+static const char * const dfault_state_strs[] = { -+#define x(n) #n, -+ DFAULT_STATES() -+#undef x -+ NULL -+}; -+ -+static void dynamic_fault_to_text(struct seq_buf *out, struct dfault *df) -+{ -+ codetag_to_text(out, &df->tag); -+ seq_buf_printf(out, "class:%s %s \"", df->class, -+ dfault_state_strs[df->state.enabled]); -+} -+ -+struct dfault_query { -+ struct codetag_query q; -+ -+ bool set_enabled:1; -+ unsigned int enabled:2; -+ -+ bool set_frequency:1; -+ unsigned int frequency; -+}; -+ -+/* -+ * Search the tables for _dfault's which match the given -+ * `query' and apply the `flags' and `mask' to them. Tells -+ * the user which dfault's were changed, or whether none -+ * were matched. 
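
[Putting the two halves together: a line written to the dynamic_fault control interface is first consumed by codetag_query_parse() above as token/value pairs (func, file, line, module, class, index, where line and index accept N or N-M ranges), and the remaining words are handed to the command keywords defined just below (disable, enable, oneshot, frequency). A few illustrative control strings follow; the file, function, module and class names, and the debugfs path, are made up for the example and are not part of this hunk.]

	/*
	 * Example lines written to the dynamic_fault control file
	 * (registered later in this file, outside this hunk):
	 *
	 *   file fs/foo/bar.c enable          - enable every fault point in one file
	 *   func foo_read_page oneshot        - fire each matching fault point once
	 *   module foo index 0-3 disable      - first four matches within module "foo"
	 *   class memory line 100-200 enable  - fault points of one class in a line range
	 */
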
-+ */ -+static int dfault_change(struct dfault_query *query) -+{ -+ struct codetag_iterator ct_iter = codetag_get_ct_iter(cttype); -+ struct codetag *ct; -+ unsigned int nfound = 0; -+ -+ codetag_lock_module_list(cttype, true); -+ -+ while ((ct = codetag_next_ct(&ct_iter))) { -+ struct dfault *df = container_of(ct, struct dfault, tag); -+ -+ if (!codetag_matches_query(&query->q, ct, ct_iter.cmod, df->class)) -+ continue; -+ -+ if (query->set_enabled && -+ query->enabled != df->state.enabled) { -+ if (query->enabled != DFAULT_disabled) -+ static_key_slow_inc(&df->enabled.key); -+ else if (df->state.enabled != DFAULT_disabled) -+ static_key_slow_dec(&df->enabled.key); -+ -+ df->state.enabled = query->enabled; -+ } -+ -+ if (query->set_frequency) -+ df->frequency = query->frequency; -+ -+ pr_debug("changed %s:%d [%s]%s #%d %s", -+ df->tag.filename, df->tag.lineno, df->tag.modname, -+ df->tag.function, query->q.cur_index, -+ dfault_state_strs[df->state.enabled]); -+ -+ nfound++; -+ } -+ -+ pr_debug("dfault: %u matches", nfound); -+ -+ codetag_lock_module_list(cttype, false); -+ -+ return nfound ? 0 : -ENOENT; -+} -+ -+#define DFAULT_TOKENS() \ -+ x(disable, 0) \ -+ x(enable, 0) \ -+ x(oneshot, 0) \ -+ x(frequency, 1) -+ -+enum dfault_token { -+#define x(name, nr_args) TOK_##name, -+ DFAULT_TOKENS() -+#undef x -+}; -+ -+static const char * const dfault_token_strs[] = { -+#define x(name, nr_args) #name, -+ DFAULT_TOKENS() -+#undef x -+ NULL -+}; -+ -+static unsigned int dfault_token_nr_args[] = { -+#define x(name, nr_args) nr_args, -+ DFAULT_TOKENS() -+#undef x -+}; -+ -+static enum dfault_token str_to_token(const char *word, unsigned int nr_words) -+{ -+ int tok = match_string(dfault_token_strs, ARRAY_SIZE(dfault_token_strs), word); -+ -+ if (tok < 0) { -+ pr_debug("unknown keyword \"%s\"", word); -+ return tok; -+ } -+ -+ if (nr_words < dfault_token_nr_args[tok]) { -+ pr_debug("insufficient arguments to \"%s\"", word); -+ return -EINVAL; -+ } -+ -+ return tok; -+} -+ -+static int dfault_parse_command(struct dfault_query *query, -+ enum dfault_token tok, -+ char *words[], size_t nr_words) -+{ -+ unsigned int i = 0; -+ int ret; -+ -+ switch (tok) { -+ case TOK_disable: -+ query->set_enabled = true; -+ query->enabled = DFAULT_disabled; -+ break; -+ case TOK_enable: -+ query->set_enabled = true; -+ query->enabled = DFAULT_enabled; -+ break; -+ case TOK_oneshot: -+ query->set_enabled = true; -+ query->enabled = DFAULT_oneshot; -+ break; -+ case TOK_frequency: -+ query->set_frequency = 1; -+ ret = kstrtouint(words[i++], 10, &query->frequency); -+ if (ret) -+ return ret; -+ -+ if (!query->set_enabled) { -+ query->set_enabled = 1; -+ query->enabled = DFAULT_enabled; -+ } -+ break; -+ } -+ -+ return i; -+} -+ -+static int dynamic_fault_store(char *buf) -+{ -+ struct dfault_query query = { NULL }; -+#define MAXWORDS 9 -+ char *tok, *words[MAXWORDS]; -+ int ret, nr_words, i = 0; -+ -+ buf = codetag_query_parse(&query.q, buf); -+ if (IS_ERR(buf)) -+ return PTR_ERR(buf); -+ -+ while ((tok = strsep_no_empty(&buf, " \t\r\n"))) { -+ if (nr_words == ARRAY_SIZE(words)) -+ return -EINVAL; /* ran out of words[] before bytes */ -+ words[nr_words++] = tok; -+ } -+ -+ while (i < nr_words) { -+ const char *tok_str = words[i++]; -+ enum dfault_token tok = str_to_token(tok_str, nr_words - i); -+ -+ if (tok < 0) -+ return tok; -+ -+ ret = dfault_parse_command(&query, tok, words + i, nr_words - i); -+ if (ret < 0) -+ return ret; -+ -+ i += ret; -+ BUG_ON(i > nr_words); -+ } -+ -+ pr_debug("q->function=\"%s\" 
q->filename=\"%s\" " -+ "q->module=\"%s\" q->line=%u-%u\n q->index=%u-%u", -+ query.q.function, query.q.filename, query.q.module, -+ query.q.first_line, query.q.last_line, -+ query.q.first_index, query.q.last_index); -+ -+ ret = dfault_change(&query); -+ if (ret < 0) -+ return ret; -+ -+ return 0; -+} -+ -+struct dfault_iter { -+ struct codetag_iterator ct_iter; -+ -+ struct seq_buf buf; -+ char rawbuf[4096]; -+}; -+ -+static int dfault_open(struct inode *inode, struct file *file) -+{ -+ struct dfault_iter *iter; -+ -+ iter = kzalloc(sizeof(*iter), GFP_KERNEL); -+ if (!iter) -+ return -ENOMEM; -+ -+ codetag_lock_module_list(cttype, true); -+ iter->ct_iter = codetag_get_ct_iter(cttype); -+ codetag_lock_module_list(cttype, false); -+ -+ file->private_data = iter; -+ seq_buf_init(&iter->buf, iter->rawbuf, sizeof(iter->rawbuf)); -+ return 0; -+} -+ -+static int dfault_release(struct inode *inode, struct file *file) -+{ -+ struct dfault_iter *iter = file->private_data; -+ -+ kfree(iter); -+ return 0; -+} -+ -+struct user_buf { -+ char __user *buf; /* destination user buffer */ -+ size_t size; /* size of requested read */ -+ ssize_t ret; /* bytes read so far */ -+}; -+ -+static int flush_ubuf(struct user_buf *dst, struct seq_buf *src) -+{ -+ if (src->len) { -+ size_t bytes = min_t(size_t, src->len, dst->size); -+ int err = copy_to_user(dst->buf, src->buffer, bytes); -+ -+ if (err) -+ return err; -+ -+ dst->ret += bytes; -+ dst->buf += bytes; -+ dst->size -= bytes; -+ src->len -= bytes; -+ memmove(src->buffer, src->buffer + bytes, src->len); -+ } -+ -+ return 0; -+} -+ -+static ssize_t dfault_read(struct file *file, char __user *ubuf, -+ size_t size, loff_t *ppos) -+{ -+ struct dfault_iter *iter = file->private_data; -+ struct user_buf buf = { .buf = ubuf, .size = size }; -+ struct codetag *ct; -+ struct dfault *df; -+ int err; -+ -+ codetag_lock_module_list(iter->ct_iter.cttype, true); -+ while (1) { -+ err = flush_ubuf(&buf, &iter->buf); -+ if (err || !buf.size) -+ break; -+ -+ ct = codetag_next_ct(&iter->ct_iter); -+ if (!ct) -+ break; -+ -+ df = container_of(ct, struct dfault, tag); -+ dynamic_fault_to_text(&iter->buf, df); -+ seq_buf_putc(&iter->buf, '\n'); -+ } -+ codetag_lock_module_list(iter->ct_iter.cttype, false); -+ -+ return err ?: buf.ret; -+} -+ -+/* -+ * File_ops->write method for /dynamic_fault/conrol. Gathers the -+ * command text from userspace, parses and executes it. 
-+ */ -+static ssize_t dfault_write(struct file *file, const char __user *ubuf, -+ size_t len, loff_t *offp) -+{ -+ char tmpbuf[256]; -+ -+ if (len == 0) -+ return 0; -+ /* we don't check *offp -- multiple writes() are allowed */ -+ if (len > sizeof(tmpbuf)-1) -+ return -E2BIG; -+ if (copy_from_user(tmpbuf, ubuf, len)) -+ return -EFAULT; -+ tmpbuf[len] = '\0'; -+ pr_debug("read %zu bytes from userspace", len); -+ -+ dynamic_fault_store(tmpbuf); -+ -+ *offp += len; -+ return len; -+} -+ -+static const struct file_operations dfault_ops = { -+ .owner = THIS_MODULE, -+ .open = dfault_open, -+ .release = dfault_release, -+ .read = dfault_read, -+ .write = dfault_write -+}; -+ -+static int __init dynamic_fault_init(void) -+{ -+ const struct codetag_type_desc desc = { -+ .section = "dynamic_fault_tags", -+ .tag_size = sizeof(struct dfault), -+ }; -+ struct dentry *debugfs_file; -+ -+ cttype = codetag_register_type(&desc); -+ if (IS_ERR_OR_NULL(cttype)) -+ return PTR_ERR(cttype); -+ -+ debugfs_file = debugfs_create_file("dynamic_faults", 0666, NULL, NULL, &dfault_ops); -+ if (IS_ERR(debugfs_file)) -+ return PTR_ERR(debugfs_file); -+ -+ return 0; -+} -+module_init(dynamic_fault_init); diff --git a/lib/errname.c b/lib/errname.c index 67739b174..dd1b99855 100644 --- a/lib/errname.c @@ -99840,10 +97788,10 @@ index f25eb111c..41f1bcdc4 100644 { if (level) { diff --git a/lib/iov_iter.c b/lib/iov_iter.c -index 960223ed9..f9c4bba27 100644 +index e4dc809d1..eb3dffb24 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c -@@ -857,24 +857,37 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i) +@@ -566,24 +566,37 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(iov_iter_zero); @@ -100330,49 +98278,10 @@ index 000000000..f45591a16 +MODULE_AUTHOR("Daniel B. 
Hill"); +MODULE_LICENSE("GPL"); diff --git a/lib/rhashtable.c b/lib/rhashtable.c -index 6ae2ba8e0..76e5bf9be 100644 +index 6ae2ba8e0..d3fce9c89 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c -@@ -130,7 +130,7 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht, - if (ntbl) - return ntbl; - -- ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); -+ ntbl = kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO); - - if (ntbl && leaf) { - for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) -@@ -157,7 +157,7 @@ static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, - - size = sizeof(*tbl) + sizeof(tbl->buckets[0]); - -- tbl = kzalloc(size, gfp); -+ tbl = kmalloc_noprof(size, gfp|__GFP_ZERO); - if (!tbl) - return NULL; - -@@ -180,8 +180,10 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, - size_t size; - int i; - static struct lock_class_key __key; -+ struct alloc_tag * __maybe_unused old = alloc_tag_save(ht->alloc_tag); - -- tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp); -+ tbl = kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets), -+ gfp|__GFP_ZERO, NUMA_NO_NODE); - - size = nbuckets; - -@@ -190,6 +192,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, - nbuckets = 0; - } - -+ alloc_tag_restore(ht->alloc_tag, old); -+ - if (tbl == NULL) - return NULL; - -@@ -360,9 +364,14 @@ static int rhashtable_rehash_alloc(struct rhashtable *ht, +@@ -360,9 +360,14 @@ static int rhashtable_rehash_alloc(struct rhashtable *ht, ASSERT_RHT_MUTEX(ht); @@ -100389,85 +98298,6 @@ index 6ae2ba8e0..76e5bf9be 100644 err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); if (err) -@@ -975,7 +984,7 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) - } - - /** -- * rhashtable_init - initialize a new hash table -+ * rhashtable_init_noprof - initialize a new hash table - * @ht: hash table to be initialized - * @params: configuration parameters - * -@@ -1016,7 +1025,7 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) - * .obj_hashfn = my_hash_fn, - * }; - */ --int rhashtable_init(struct rhashtable *ht, -+int rhashtable_init_noprof(struct rhashtable *ht, - const struct rhashtable_params *params) - { - struct bucket_table *tbl; -@@ -1031,6 +1040,10 @@ int rhashtable_init(struct rhashtable *ht, - spin_lock_init(&ht->lock); - memcpy(&ht->p, params, sizeof(*params)); - -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ ht->alloc_tag = current->alloc_tag; -+#endif -+ - if (params->min_size) - ht->p.min_size = roundup_pow_of_two(params->min_size); - -@@ -1076,26 +1089,26 @@ int rhashtable_init(struct rhashtable *ht, - - return 0; - } --EXPORT_SYMBOL_GPL(rhashtable_init); -+EXPORT_SYMBOL_GPL(rhashtable_init_noprof); - - /** -- * rhltable_init - initialize a new hash list table -+ * rhltable_init_noprof - initialize a new hash list table - * @hlt: hash list table to be initialized - * @params: configuration parameters - * - * Initializes a new hash list table. - * -- * See documentation for rhashtable_init. -+ * See documentation for rhashtable_init_noprof. 
- */ --int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) -+int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params) - { - int err; - -- err = rhashtable_init(&hlt->ht, params); -+ err = rhashtable_init_noprof(&hlt->ht, params); - hlt->ht.rhlist = true; - return err; - } --EXPORT_SYMBOL_GPL(rhltable_init); -+EXPORT_SYMBOL_GPL(rhltable_init_noprof); - - static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, - void (*free_fn)(void *ptr, void *arg), -@@ -1222,6 +1235,7 @@ struct rhash_lock_head __rcu **rht_bucket_nested_insert( - unsigned int index = hash & ((1 << tbl->nest) - 1); - unsigned int size = tbl->size >> tbl->nest; - union nested_table *ntbl; -+ struct alloc_tag * __maybe_unused old = alloc_tag_save(ht->alloc_tag); - - ntbl = nested_table_top(tbl); - hash >>= tbl->nest; -@@ -1236,6 +1250,8 @@ struct rhash_lock_head __rcu **rht_bucket_nested_insert( - size <= (1 << shift)); - } - -+ alloc_tag_restore(ht->alloc_tag, old); -+ - if (!ntbl) - return NULL; - diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 45c450f42..2b87e9219 100644 --- a/lib/seq_buf.c @@ -100486,38 +98316,8 @@ index 45c450f42..2b87e9219 100644 + seq_buf_commit(s, wrote); +} +EXPORT_SYMBOL(seq_buf_human_readable_u64); -diff --git a/lib/string.c b/lib/string.c -index 3d55ef890..dd4914baf 100644 ---- a/lib/string.c -+++ b/lib/string.c -@@ -520,6 +520,25 @@ char *strsep(char **s, const char *ct) - EXPORT_SYMBOL(strsep); - #endif - -+/** -+ * strsep_no_empt - Split a string into tokens, but don't return empty tokens -+ * @s: The string to be searched -+ * @ct: The characters to search for -+ * -+ * strsep() updates @s to point after the token, ready for the next call. -+ */ -+char *strsep_no_empty(char **s, const char *ct) -+{ -+ char *ret; -+ -+ do { -+ ret = strsep(s, ct); -+ } while (ret && !*ret); -+ -+ return ret; -+} -+EXPORT_SYMBOL_GPL(strsep_no_empty); -+ - #ifndef __HAVE_ARCH_MEMSET - /** - * memset - Fill a region of memory with the given value diff --git a/lib/string_helpers.c b/lib/string_helpers.c -index 230020a2e..d527ce455 100644 +index d3b1dd718..c29dd105b 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -19,11 +19,17 @@ @@ -100591,97 +98391,11 @@ index 9a68849a5..0b01ffca9 100644 test_string_get_size_check("STRING_UNITS_10", exp_result10, buf10, size, blk_size); -diff --git a/mm/Makefile b/mm/Makefile -index e29afc890..e2ecfe0ea 100644 ---- a/mm/Makefile -+++ b/mm/Makefile -@@ -53,7 +53,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ - mm_init.o percpu.o slab_common.o \ - compaction.o \ - interval_tree.o list_lru.o workingset.o \ -- debug.o gup.o mmap_lock.o $(mmu-y) -+ debug.o gup.o mmap_lock.o show_mem.o $(mmu-y) - - # Give 'page_alloc' its own module-parameter namespace - page-alloc-y := page_alloc.o -diff --git a/mm/compaction.c b/mm/compaction.c -index c8bcdea15..09dd56a94 100644 ---- a/mm/compaction.c -+++ b/mm/compaction.c -@@ -1684,8 +1684,8 @@ static void isolate_freepages(struct compact_control *cc) - * This is a migrate-callback that "allocates" freepages by taking pages - * from the isolated freelists in the block we are migrating to. 
- */ --static struct page *compaction_alloc(struct page *migratepage, -- unsigned long data) -+static struct page *compaction_alloc_noprof(struct page *migratepage, -+ unsigned long data) - { - struct compact_control *cc = (struct compact_control *)data; - struct page *freepage; -@@ -1704,6 +1704,12 @@ static struct page *compaction_alloc(struct page *migratepage, - return freepage; - } - -+static struct page *compaction_alloc(struct page *migratepage, -+ unsigned long data) -+{ -+ return alloc_hooks(compaction_alloc_noprof(migratepage, data)); -+} -+ - /* - * This is a migrate-callback that "frees" freepages back to the isolated - * freelist. All pages on the freelist are from the same zone, so there is no -diff --git a/mm/filemap.c b/mm/filemap.c -index 8abce63b2..e38eec523 100644 ---- a/mm/filemap.c -+++ b/mm/filemap.c -@@ -958,7 +958,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, - EXPORT_SYMBOL_GPL(filemap_add_folio); - - #ifdef CONFIG_NUMA --struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) -+struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) - { - int n; - struct folio *folio; -@@ -973,9 +973,9 @@ struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) - - return folio; - } -- return folio_alloc(gfp, order); -+ return folio_alloc_noprof(gfp, order); - } --EXPORT_SYMBOL(filemap_alloc_folio); -+EXPORT_SYMBOL(filemap_alloc_folio_noprof); - #endif - - /* -diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index 624671aaa..221cce005 100644 ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -37,6 +37,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -2557,6 +2558,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, - /* Caller disabled irqs, so they are still disabled here */ - - split_page_owner(head, nr); -+ pgalloc_tag_split(head, nr); - - /* See comment in __split_huge_page_tail() */ - if (PageAnon(head)) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c -index f791076da..3e5a604ee 100644 +index 6da626bfb..4165e22b0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c -@@ -3246,7 +3246,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) +@@ -3270,7 +3270,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) if (i == h->max_huge_pages_node[nid]) return; @@ -100690,7 +98404,7 @@ index f791076da..3e5a604ee 100644 pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", h->max_huge_pages_node[nid], buf, nid, i); h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); -@@ -3308,7 +3308,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) +@@ -3332,7 +3332,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (i < h->max_huge_pages) { char buf[32]; @@ -100699,7 +98413,7 @@ index f791076da..3e5a604ee 100644 pr_warn("HugeTLB: allocating %lu of page size %s failed. 
Only allocated %lu hugepages.\n", h->max_huge_pages, buf, i); h->max_huge_pages = i; -@@ -3354,7 +3354,7 @@ static void __init report_hugepages(void) +@@ -3378,7 +3378,7 @@ static void __init report_hugepages(void) for_each_hstate(h) { char buf[32]; @@ -100708,7 +98422,7 @@ index f791076da..3e5a604ee 100644 pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", buf, h->free_huge_pages); pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", -@@ -4245,7 +4245,7 @@ static int __init hugetlb_init(void) +@@ -4269,7 +4269,7 @@ static int __init hugetlb_init(void) char buf[32]; string_get_size(huge_page_size(&default_hstate), @@ -100717,65 +98431,11 @@ index f791076da..3e5a604ee 100644 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", default_hstate.max_huge_pages, buf); pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", -diff --git a/mm/kfence/core.c b/mm/kfence/core.c -index dad3c0eb7..aea6fa145 100644 ---- a/mm/kfence/core.c -+++ b/mm/kfence/core.c -@@ -590,9 +590,9 @@ static unsigned long kfence_init_pool(void) - continue; - - __folio_set_slab(slab_folio(slab)); --#ifdef CONFIG_MEMCG -- slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg | -- MEMCG_DATA_OBJCGS; -+#ifdef CONFIG_MEMCG_KMEM -+ slab->obj_exts = (unsigned long)&kfence_metadata[i / 2 - 1].obj_exts | -+ MEMCG_DATA_OBJEXTS; - #endif - } - -@@ -634,8 +634,8 @@ static unsigned long kfence_init_pool(void) - - if (!i || (i % 2)) - continue; --#ifdef CONFIG_MEMCG -- slab->memcg_data = 0; -+#ifdef CONFIG_MEMCG_KMEM -+ slab->obj_exts = 0; - #endif - __folio_clear_slab(slab_folio(slab)); - } -@@ -1093,8 +1093,8 @@ void __kfence_free(void *addr) - { - struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); - --#ifdef CONFIG_MEMCG -- KFENCE_WARN_ON(meta->objcg); -+#ifdef CONFIG_MEMCG_KMEM -+ KFENCE_WARN_ON(meta->obj_exts.objcg); - #endif - /* - * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing -diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h -index 392fb273e..b02d2cb96 100644 ---- a/mm/kfence/kfence.h -+++ b/mm/kfence/kfence.h -@@ -97,8 +97,8 @@ struct kfence_metadata { - struct kfence_track free_track; - /* For updating alloc_covered on frees. */ - u32 alloc_stack_hash; --#ifdef CONFIG_MEMCG -- struct obj_cgroup *objcg; -+#ifdef CONFIG_MEMCG_KMEM -+ struct slabobj_ext obj_exts; - #endif - }; - diff --git a/mm/madvise.c b/mm/madvise.c -index b5ffbaf61..e08639a7c 100644 +index ec30f48f8..fa2f140d0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c -@@ -1311,6 +1311,64 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, +@@ -1330,6 +1330,64 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ @@ -100840,7 +98500,7 @@ index b5ffbaf61..e08639a7c 100644 /* * The madvise(2) system call. 
* -@@ -1390,6 +1448,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh +@@ -1409,6 +1467,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh size_t len; struct blk_plug plug; @@ -100850,372 +98510,8 @@ index b5ffbaf61..e08639a7c 100644 if (!madvise_behavior_valid(behavior)) return -EINVAL; -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 4b27e245a..f2a7fe718 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -2892,13 +2892,6 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) - } - - #ifdef CONFIG_MEMCG_KMEM --/* -- * The allocated objcg pointers array is not accounted directly. -- * Moreover, it should not come from DMA buffer and is not readily -- * reclaimable. So those GFP bits should be masked off. -- */ --#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) -- - /* - * mod_objcg_mlstate() may be called with irq enabled, so - * mod_memcg_lruvec_state() should be used. -@@ -2917,62 +2910,27 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, - rcu_read_unlock(); - } - --int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, -- gfp_t gfp, bool new_slab) --{ -- unsigned int objects = objs_per_slab(s, slab); -- unsigned long memcg_data; -- void *vec; -- -- gfp &= ~OBJCGS_CLEAR_MASK; -- vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, -- slab_nid(slab)); -- if (!vec) -- return -ENOMEM; -- -- memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; -- if (new_slab) { -- /* -- * If the slab is brand new and nobody can yet access its -- * memcg_data, no synchronization is required and memcg_data can -- * be simply assigned. -- */ -- slab->memcg_data = memcg_data; -- } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { -- /* -- * If the slab is already in use, somebody can allocate and -- * assign obj_cgroups in parallel. In this case the existing -- * objcg vector should be reused. -- */ -- kfree(vec); -- return 0; -- } -- -- kmemleak_not_leak(vec); -- return 0; --} -- - static __always_inline - struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) - { - /* - * Slab objects are accounted individually, not per-page. - * Memcg membership data for each individual object is saved in -- * slab->memcg_data. -+ * slab->obj_exts. - */ - if (folio_test_slab(folio)) { -- struct obj_cgroup **objcgs; -+ struct slabobj_ext *obj_exts; - struct slab *slab; - unsigned int off; - - slab = folio_slab(folio); -- objcgs = slab_objcgs(slab); -- if (!objcgs) -+ obj_exts = slab_obj_exts(slab); -+ if (!obj_exts) - return NULL; - - off = obj_to_index(slab->slab_cache, slab, p); -- if (objcgs[off]) -- return obj_cgroup_memcg(objcgs[off]); -+ if (obj_exts[off].objcg) -+ return obj_cgroup_memcg(obj_exts[off].objcg); - - return NULL; - } -@@ -2980,7 +2938,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) - /* - * folio_memcg_check() is used here, because in theory we can encounter - * a folio where the slab flag has been cleared already, but -- * slab->memcg_data has not been freed yet -+ * slab->obj_exts has not been freed yet - * folio_memcg_check() will guarantee that a proper memory - * cgroup pointer or NULL will be returned. 
- */ -diff --git a/mm/mempolicy.c b/mm/mempolicy.c -index 1756389a0..aaf767767 100644 ---- a/mm/mempolicy.c -+++ b/mm/mempolicy.c -@@ -2109,7 +2109,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, - { - struct page *page; - -- page = __alloc_pages(gfp, order, nid, NULL); -+ page = __alloc_pages_noprof(gfp, order, nid, NULL); - /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ - if (!static_branch_likely(&vm_numa_stat_key)) - return page; -@@ -2135,15 +2135,15 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, - */ - preferred_gfp = gfp | __GFP_NOWARN; - preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); -- page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); -+ page = __alloc_pages_noprof(preferred_gfp, order, nid, &pol->nodes); - if (!page) -- page = __alloc_pages(gfp, order, nid, NULL); -+ page = __alloc_pages_noprof(gfp, order, nid, NULL); - - return page; - } - - /** -- * vma_alloc_folio - Allocate a folio for a VMA. -+ * vma_alloc_folio_noprof - Allocate a folio for a VMA. - * @gfp: GFP flags. - * @order: Order of the folio. - * @vma: Pointer to VMA or NULL if not available. -@@ -2157,7 +2157,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, - * - * Return: The folio on success or NULL if allocation fails. - */ --struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, -+struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage) - { - struct mempolicy *pol; -@@ -2228,7 +2228,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - * memory with both reclaim and compact as well. - */ - if (!folio && (gfp & __GFP_DIRECT_RECLAIM)) -- folio = __folio_alloc(gfp, order, hpage_node, -+ folio = __folio_alloc_noprof(gfp, order, hpage_node, - nmask); - - goto out; -@@ -2237,15 +2237,15 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - - nmask = policy_nodemask(gfp, pol); - preferred_nid = policy_node(gfp, pol, node); -- folio = __folio_alloc(gfp, order, preferred_nid, nmask); -+ folio = __folio_alloc_noprof(gfp, order, preferred_nid, nmask); - mpol_cond_put(pol); - out: - return folio; - } --EXPORT_SYMBOL(vma_alloc_folio); -+EXPORT_SYMBOL(vma_alloc_folio_noprof); - - /** -- * alloc_pages - Allocate pages. -+ * alloc_pages_noprof - Allocate pages. - * @gfp: GFP flags. - * @order: Power of two of number of pages to allocate. - * -@@ -2258,7 +2258,7 @@ EXPORT_SYMBOL(vma_alloc_folio); - * flags are used. - * Return: The page on success or NULL if allocation fails. 
- */ --struct page *alloc_pages(gfp_t gfp, unsigned order) -+struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) - { - struct mempolicy *pol = &default_policy; - struct page *page; -@@ -2276,23 +2276,23 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) - page = alloc_pages_preferred_many(gfp, order, - policy_node(gfp, pol, numa_node_id()), pol); - else -- page = __alloc_pages(gfp, order, -+ page = __alloc_pages_noprof(gfp, order, - policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol)); - - return page; - } --EXPORT_SYMBOL(alloc_pages); -+EXPORT_SYMBOL(alloc_pages_noprof); - --struct folio *folio_alloc(gfp_t gfp, unsigned order) -+struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) - { -- struct page *page = alloc_pages(gfp | __GFP_COMP, order); -+ struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); - - if (page && order > 1) - prep_transhuge_page(page); - return (struct folio *)page; - } --EXPORT_SYMBOL(folio_alloc); -+EXPORT_SYMBOL(folio_alloc_noprof); - - static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, - struct mempolicy *pol, unsigned long nr_pages, -@@ -2311,13 +2311,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, - - for (i = 0; i < nodes; i++) { - if (delta) { -- nr_allocated = __alloc_pages_bulk(gfp, -+ nr_allocated = alloc_pages_bulk_noprof(gfp, - interleave_nodes(pol), NULL, - nr_pages_per_node + 1, NULL, - page_array); - delta--; - } else { -- nr_allocated = __alloc_pages_bulk(gfp, -+ nr_allocated = alloc_pages_bulk_noprof(gfp, - interleave_nodes(pol), NULL, - nr_pages_per_node, NULL, page_array); - } -@@ -2339,11 +2339,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, - preferred_gfp = gfp | __GFP_NOWARN; - preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - -- nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, -+ nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, - nr_pages, NULL, page_array); - - if (nr_allocated < nr_pages) -- nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, -+ nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, - nr_pages - nr_allocated, NULL, - page_array + nr_allocated); - return nr_allocated; -@@ -2355,7 +2355,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, - * It can accelerate memory allocation especially interleaving - * allocate memory. - */ --unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, -+unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, - unsigned long nr_pages, struct page **page_array) - { - struct mempolicy *pol = &default_policy; -@@ -2371,7 +2371,7 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, - return alloc_pages_bulk_array_preferred_many(gfp, - numa_node_id(), pol, nr_pages, page_array); - -- return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), -+ return alloc_pages_bulk_noprof(gfp, policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol), nr_pages, NULL, - page_array); - } -diff --git a/mm/mempool.c b/mm/mempool.c -index 734bcf5af..4fd949178 100644 ---- a/mm/mempool.c -+++ b/mm/mempool.c -@@ -230,17 +230,17 @@ EXPORT_SYMBOL(mempool_init_node); - * - * Return: %0 on success, negative error code otherwise. 
- */ --int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, -- mempool_free_t *free_fn, void *pool_data) -+int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, -+ mempool_free_t *free_fn, void *pool_data) - { - return mempool_init_node(pool, min_nr, alloc_fn, free_fn, - pool_data, GFP_KERNEL, NUMA_NO_NODE); - - } --EXPORT_SYMBOL(mempool_init); -+EXPORT_SYMBOL(mempool_init_noprof); - - /** -- * mempool_create - create a memory pool -+ * mempool_create_node - create a memory pool - * @min_nr: the minimum number of elements guaranteed to be - * allocated for this pool. - * @alloc_fn: user-defined element-allocation function. -@@ -255,17 +255,9 @@ EXPORT_SYMBOL(mempool_init); - * - * Return: pointer to the created memory pool object or %NULL on error. - */ --mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, -- mempool_free_t *free_fn, void *pool_data) --{ -- return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data, -- GFP_KERNEL, NUMA_NO_NODE); --} --EXPORT_SYMBOL(mempool_create); -- --mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, -- mempool_free_t *free_fn, void *pool_data, -- gfp_t gfp_mask, int node_id) -+mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, -+ mempool_free_t *free_fn, void *pool_data, -+ gfp_t gfp_mask, int node_id) - { - mempool_t *pool; - -@@ -281,7 +273,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, - - return pool; - } --EXPORT_SYMBOL(mempool_create_node); -+EXPORT_SYMBOL(mempool_create_node_noprof); - - /** - * mempool_resize - resize an existing memory pool -@@ -377,7 +369,7 @@ EXPORT_SYMBOL(mempool_resize); - * - * Return: pointer to the allocated element or %NULL on error. - */ --void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) -+void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) - { - void *element; - unsigned long flags; -@@ -444,7 +436,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) - finish_wait(&pool->wait, &wait); - goto repeat_alloc; - } --EXPORT_SYMBOL(mempool_alloc); -+EXPORT_SYMBOL(mempool_alloc_noprof); - - /** - * mempool_free - return an element to the pool. 
-@@ -515,7 +507,7 @@ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) - { - struct kmem_cache *mem = pool_data; - VM_BUG_ON(mem->ctor); -- return kmem_cache_alloc(mem, gfp_mask); -+ return kmem_cache_alloc_noprof(mem, gfp_mask); - } - EXPORT_SYMBOL(mempool_alloc_slab); - -@@ -533,7 +525,7 @@ EXPORT_SYMBOL(mempool_free_slab); - void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) - { - size_t size = (size_t)pool_data; -- return kmalloc(size, gfp_mask); -+ return kmalloc_noprof(size, gfp_mask); - } - EXPORT_SYMBOL(mempool_kmalloc); - -@@ -550,7 +542,7 @@ EXPORT_SYMBOL(mempool_kfree); - void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) - { - int order = (int)(long)pool_data; -- return alloc_pages(gfp_mask, order); -+ return alloc_pages_noprof(gfp_mask, order); - } - EXPORT_SYMBOL(mempool_alloc_pages); - -diff --git a/mm/mm_init.c b/mm/mm_init.c -index 7f7f9c677..42135fad4 100644 ---- a/mm/mm_init.c -+++ b/mm/mm_init.c -@@ -24,6 +24,7 @@ - #include - #include - #include -+#include - #include - #include - #include "internal.h" diff --git a/mm/oom_kill.c b/mm/oom_kill.c -index 044e1eed7..f2657245e 100644 +index 612b5597d..467cff51f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -168,27 +168,6 @@ static bool oom_unkillable_task(struct task_struct *p) @@ -101255,582 +98551,24 @@ index 044e1eed7..f2657245e 100644 } if (sysctl_oom_dump_tasks) dump_tasks(oc); -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 47421bedc..e20ef7a00 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -74,6 +74,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -1259,6 +1260,7 @@ static __always_inline bool free_pages_prepare(struct page *page, - __memcg_kmem_uncharge_page(page, order); - reset_page_owner(page, order); - page_table_check_free(page, order); -+ pgalloc_tag_sub(page, order); - return false; - } - -@@ -1301,6 +1303,7 @@ static __always_inline bool free_pages_prepare(struct page *page, - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - reset_page_owner(page, order); - page_table_check_free(page, order); -+ pgalloc_tag_sub(page, order); - - if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page), -@@ -1730,6 +1733,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, - - set_page_owner(page, order, gfp_flags); - page_table_check_alloc(page, order); -+ pgalloc_tag_add(page, current, order); - } - - static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, -@@ -2790,6 +2794,7 @@ void split_page(struct page *page, unsigned int order) - for (i = 1; i < (1 << order); i++) - set_page_refcounted(page + i); - split_page_owner(page, 1 << order); -+ pgalloc_tag_split(page, 1 << order); - split_page_memcg(page, 1 << order); - } - EXPORT_SYMBOL_GPL(split_page); -@@ -4577,7 +4582,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, - * - * Returns the number of pages on the list or array. 
- */ --unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, -+unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, - nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, - struct page **page_array) -@@ -4713,7 +4718,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, - pcp_trylock_finish(UP_flags); - - failed: -- page = __alloc_pages(gfp, 0, preferred_nid, nodemask); -+ page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); - if (page) { - if (page_list) - list_add(&page->lru, page_list); -@@ -4724,13 +4729,13 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, - - goto out; - } --EXPORT_SYMBOL_GPL(__alloc_pages_bulk); -+EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); - - /* - * This is the 'heart' of the zoned buddy allocator. - */ --struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, -- nodemask_t *nodemask) -+struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, -+ int preferred_nid, nodemask_t *nodemask) - { - struct page *page; - unsigned int alloc_flags = ALLOC_WMARK_LOW; -@@ -4792,41 +4797,41 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, - - return page; - } --EXPORT_SYMBOL(__alloc_pages); -+EXPORT_SYMBOL(__alloc_pages_noprof); - --struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, -+struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, - nodemask_t *nodemask) - { -- struct page *page = __alloc_pages(gfp | __GFP_COMP, order, -+ struct page *page = __alloc_pages_noprof(gfp | __GFP_COMP, order, - preferred_nid, nodemask); - - if (page && order > 1) - prep_transhuge_page(page); - return (struct folio *)page; - } --EXPORT_SYMBOL(__folio_alloc); -+EXPORT_SYMBOL(__folio_alloc_noprof); - - /* - * Common helper functions. Never use with __GFP_HIGHMEM because the returned - * address cannot represent highmem pages. Use alloc_pages and then kmap if - * you need to access high mem. - */ --unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) -+unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order) - { - struct page *page; - -- page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); -+ page = alloc_pages_noprof(gfp_mask & ~__GFP_HIGHMEM, order); - if (!page) - return 0; - return (unsigned long) page_address(page); - } --EXPORT_SYMBOL(__get_free_pages); -+EXPORT_SYMBOL(get_free_pages_noprof); - --unsigned long get_zeroed_page(gfp_t gfp_mask) -+unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) - { -- return __get_free_page(gfp_mask | __GFP_ZERO); -+ return get_free_pages_noprof(gfp_mask | __GFP_ZERO, 0); - } --EXPORT_SYMBOL(get_zeroed_page); -+EXPORT_SYMBOL(get_zeroed_page_noprof); - - /** - * __free_pages - Free pages allocated with alloc_pages(). -@@ -5006,6 +5011,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, - struct page *last = page + nr; - - split_page_owner(page, 1 << order); -+ pgalloc_tag_split(page, 1 << order); - split_page_memcg(page, 1 << order); - while (page < --last) - set_page_refcounted(last); -@@ -5018,7 +5024,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, - } - - /** -- * alloc_pages_exact - allocate an exact number physically-contiguous pages. -+ * alloc_pages_exact_noprof - allocate an exact number physically-contiguous pages. 
- * @size: the number of bytes to allocate - * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP - * -@@ -5032,7 +5038,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, - * - * Return: pointer to the allocated area or %NULL in case of error. - */ --void *alloc_pages_exact(size_t size, gfp_t gfp_mask) -+void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) - { - unsigned int order = get_order(size); - unsigned long addr; -@@ -5040,13 +5046,13 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) - if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) - gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); - -- addr = __get_free_pages(gfp_mask, order); -+ addr = get_free_pages_noprof(gfp_mask, order); - return make_alloc_exact(addr, order, size); - } --EXPORT_SYMBOL(alloc_pages_exact); -+EXPORT_SYMBOL(alloc_pages_exact_noprof); - - /** -- * alloc_pages_exact_nid - allocate an exact number of physically-contiguous -+ * alloc_pages_exact_nid_noprof - allocate an exact number of physically-contiguous - * pages on a node. - * @nid: the preferred node ID where memory should be allocated - * @size: the number of bytes to allocate -@@ -5057,7 +5063,7 @@ EXPORT_SYMBOL(alloc_pages_exact); - * - * Return: pointer to the allocated area or %NULL in case of error. - */ --void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) -+void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) - { - unsigned int order = get_order(size); - struct page *p; -@@ -5065,7 +5071,7 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) - if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) - gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); - -- p = alloc_pages_node(nid, gfp_mask, order); -+ p = alloc_pages_node_noprof(nid, gfp_mask, order); - if (!p) - return NULL; - return make_alloc_exact((unsigned long)page_address(p), order, size); -@@ -6738,7 +6744,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, - } - - /** -- * alloc_contig_range() -- tries to allocate given range of pages -+ * alloc_contig_range_noprof() -- tries to allocate given range of pages - * @start: start PFN to allocate - * @end: one-past-the-last PFN to allocate - * @migratetype: migratetype of the underlying pageblocks (either -@@ -6758,7 +6764,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, - * pages which PFN is in [start, end) are allocated for the caller and - * need to be freed with free_contig_range(). 
- */ --int alloc_contig_range(unsigned long start, unsigned long end, -+int alloc_contig_range_noprof(unsigned long start, unsigned long end, - unsigned migratetype, gfp_t gfp_mask) - { - unsigned long outer_start, outer_end; -@@ -6882,15 +6888,15 @@ int alloc_contig_range(unsigned long start, unsigned long end, - undo_isolate_page_range(start, end, migratetype); - return ret; - } --EXPORT_SYMBOL(alloc_contig_range); -+EXPORT_SYMBOL(alloc_contig_range_noprof); - - static int __alloc_contig_pages(unsigned long start_pfn, - unsigned long nr_pages, gfp_t gfp_mask) - { - unsigned long end_pfn = start_pfn + nr_pages; - -- return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, -- gfp_mask); -+ return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE, -+ gfp_mask); - } - - static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, -@@ -6925,7 +6931,7 @@ static bool zone_spans_last_pfn(const struct zone *zone, - } - - /** -- * alloc_contig_pages() -- tries to find and allocate contiguous range of pages -+ * alloc_contig_pages_noprof() -- tries to find and allocate contiguous range of pages - * @nr_pages: Number of contiguous pages to allocate - * @gfp_mask: GFP mask to limit search and used during compaction - * @nid: Target node -@@ -6945,8 +6951,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, - * - * Return: pointer to contiguous pages on success, or NULL if not successful. - */ --struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, -- int nid, nodemask_t *nodemask) -+struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, -+ int nid, nodemask_t *nodemask) - { - unsigned long ret, pfn, flags; - struct zonelist *zonelist; -diff --git a/mm/page_ext.c b/mm/page_ext.c -index dc1626be4..6c8ad6e12 100644 ---- a/mm/page_ext.c -+++ b/mm/page_ext.c -@@ -10,6 +10,7 @@ - #include - #include - #include -+#include - - /* - * struct page extension -@@ -82,6 +83,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { - #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) - &page_idle_ops, - #endif -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ &page_alloc_tagging_ops, -+#endif - #ifdef CONFIG_PAGE_TABLE_CHECK - &page_table_check_ops, - #endif -@@ -92,7 +96,16 @@ unsigned long page_ext_size; - static unsigned long total_usage; - static struct page_ext *lookup_page_ext(const struct page *page); - -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+/* -+ * To ensure correct allocation tagging for pages, page_ext should be available -+ * before the first page allocation. Otherwise early task stacks will be -+ * allocated before page_ext initialization and missing tags will be flagged. 
-+ */ -+bool early_page_ext __meminitdata = true; -+#else - bool early_page_ext __meminitdata; -+#endif - static int __init setup_early_page_ext(char *str) - { - early_page_ext = true; -diff --git a/mm/page_owner.c b/mm/page_owner.c -index 31169b3e7..8b6086c66 100644 ---- a/mm/page_owner.c -+++ b/mm/page_owner.c -@@ -372,7 +372,7 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, - if (!memcg_data) - goto out_unlock; - -- if (memcg_data & MEMCG_DATA_OBJCGS) -+ if (memcg_data & MEMCG_DATA_OBJEXTS) - ret += scnprintf(kbuf + ret, count - ret, - "Slab cache page\n"); - -diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h -index f9847c131..c5d1d6723 100644 ---- a/mm/percpu-internal.h -+++ b/mm/percpu-internal.h -@@ -32,6 +32,19 @@ struct pcpu_block_md { - int nr_bits; /* total bits responsible for */ - }; - -+struct pcpuobj_ext { -+#ifdef CONFIG_MEMCG_KMEM -+ struct obj_cgroup *cgroup; -+#endif -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ union codetag_ref tag; -+#endif -+}; -+ -+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MEM_ALLOC_PROFILING) -+#define NEED_PCPUOBJ_EXT -+#endif -+ - struct pcpu_chunk { - #ifdef CONFIG_PERCPU_STATS - int nr_alloc; /* # of allocations */ -@@ -57,8 +70,8 @@ struct pcpu_chunk { - int end_offset; /* additional area required to - have the region end page - aligned */ --#ifdef CONFIG_MEMCG_KMEM -- struct obj_cgroup **obj_cgroups; /* vector of object cgroups */ -+#ifdef NEED_PCPUOBJ_EXT -+ struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ - #endif - - int nr_pages; /* # of pages served by this chunk */ -@@ -67,6 +80,15 @@ struct pcpu_chunk { - unsigned long populated[]; /* populated bitmap */ - }; - -+static inline bool need_pcpuobj_ext(void) -+{ -+ if (IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING)) -+ return true; -+ if (!mem_cgroup_kmem_disabled()) -+ return true; -+ return false; -+} -+ - extern spinlock_t pcpu_lock; - - extern struct list_head *pcpu_chunk_lists; -diff --git a/mm/percpu.c b/mm/percpu.c -index 28e07ede4..2298f38d4 100644 ---- a/mm/percpu.c -+++ b/mm/percpu.c -@@ -1392,9 +1392,9 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); - --#ifdef CONFIG_MEMCG_KMEM -+#ifdef NEED_PCPUOBJ_EXT - /* first chunk is free to use */ -- chunk->obj_cgroups = NULL; -+ chunk->obj_exts = NULL; - #endif - pcpu_init_md_blocks(chunk); - -@@ -1463,12 +1463,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) - if (!chunk->md_blocks) - goto md_blocks_fail; - --#ifdef CONFIG_MEMCG_KMEM -- if (!mem_cgroup_kmem_disabled()) { -- chunk->obj_cgroups = -+#ifdef NEED_PCPUOBJ_EXT -+ if (need_pcpuobj_ext()) { -+ chunk->obj_exts = - pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) * -- sizeof(struct obj_cgroup *), gfp); -- if (!chunk->obj_cgroups) -+ sizeof(struct pcpuobj_ext), gfp); -+ if (!chunk->obj_exts) - goto objcg_fail; - } - #endif -@@ -1480,7 +1480,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) - - return chunk; - --#ifdef CONFIG_MEMCG_KMEM -+#ifdef NEED_PCPUOBJ_EXT - objcg_fail: - pcpu_mem_free(chunk->md_blocks); - #endif -@@ -1498,8 +1498,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) - { - if (!chunk) - return; --#ifdef CONFIG_MEMCG_KMEM -- pcpu_mem_free(chunk->obj_cgroups); -+#ifdef NEED_PCPUOBJ_EXT -+ pcpu_mem_free(chunk->obj_exts); - #endif - pcpu_mem_free(chunk->md_blocks); - pcpu_mem_free(chunk->bound_map); -@@ -1648,8 +1648,8 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, - if (!objcg) 
- return; - -- if (likely(chunk && chunk->obj_cgroups)) { -- chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; -+ if (likely(chunk && chunk->obj_exts)) { -+ chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg; - - rcu_read_lock(); - mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, -@@ -1665,13 +1665,13 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) - { - struct obj_cgroup *objcg; - -- if (unlikely(!chunk->obj_cgroups)) -+ if (unlikely(!chunk->obj_exts)) - return; - -- objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT]; -+ objcg = chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup; - if (!objcg) - return; -- chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; -+ chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = NULL; - - obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); - -@@ -1701,8 +1701,34 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) - } - #endif /* CONFIG_MEMCG_KMEM */ - -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, -+ size_t size) -+{ -+ if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) { -+ alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, -+ current->alloc_tag, size); -+ } -+} -+ -+static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) -+{ -+ if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) -+ alloc_tag_sub_noalloc(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size); -+} -+#else -+static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, -+ size_t size) -+{ -+} -+ -+static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) -+{ -+} -+#endif -+ - /** -- * pcpu_alloc - the percpu allocator -+ * pcpu_alloc_noprof - the percpu allocator - * @size: size of area to allocate in bytes - * @align: alignment of area (max PAGE_SIZE) - * @reserved: allocate from the reserved chunk if available -@@ -1716,7 +1742,7 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) - * RETURNS: - * Percpu pointer to the allocated area on success, NULL on failure. - */ --static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, -+void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, - gfp_t gfp) - { - gfp_t pcpu_gfp; -@@ -1883,6 +1909,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, - - pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); - -+ pcpu_alloc_tag_alloc_hook(chunk, off, size); -+ - return ptr; - - fail_unlock: -@@ -1909,61 +1937,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, - - return NULL; - } -- --/** -- * __alloc_percpu_gfp - allocate dynamic percpu area -- * @size: size of area to allocate in bytes -- * @align: alignment of area (max PAGE_SIZE) -- * @gfp: allocation flags -- * -- * Allocate zero-filled percpu area of @size bytes aligned at @align. If -- * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can -- * be called from any context but is a lot more likely to fail. If @gfp -- * has __GFP_NOWARN then no warning will be triggered on invalid or failed -- * allocation requests. -- * -- * RETURNS: -- * Percpu pointer to the allocated area on success, NULL on failure. 
-- */ --void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) --{ -- return pcpu_alloc(size, align, false, gfp); --} --EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); -- --/** -- * __alloc_percpu - allocate dynamic percpu area -- * @size: size of area to allocate in bytes -- * @align: alignment of area (max PAGE_SIZE) -- * -- * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). -- */ --void __percpu *__alloc_percpu(size_t size, size_t align) --{ -- return pcpu_alloc(size, align, false, GFP_KERNEL); --} --EXPORT_SYMBOL_GPL(__alloc_percpu); -- --/** -- * __alloc_reserved_percpu - allocate reserved percpu area -- * @size: size of area to allocate in bytes -- * @align: alignment of area (max PAGE_SIZE) -- * -- * Allocate zero-filled percpu area of @size bytes aligned at @align -- * from reserved percpu area if arch has set it up; otherwise, -- * allocation is served from the same dynamic area. Might sleep. -- * Might trigger writeouts. -- * -- * CONTEXT: -- * Does GFP_KERNEL allocation. -- * -- * RETURNS: -- * Percpu pointer to the allocated area on success, NULL on failure. -- */ --void __percpu *__alloc_reserved_percpu(size_t size, size_t align) --{ -- return pcpu_alloc(size, align, true, GFP_KERNEL); --} -+EXPORT_SYMBOL_GPL(pcpu_alloc_noprof); - - /** - * pcpu_balance_free - manage the amount of free chunks -@@ -2273,6 +2247,8 @@ void free_percpu(void __percpu *ptr) - - size = pcpu_free_area(chunk, off); - -+ pcpu_alloc_tag_free_hook(chunk, off, size); -+ - pcpu_memcg_free_hook(chunk, off, size); - - /* -diff --git a/lib/show_mem.c b/mm/show_mem.c -similarity index 57% -rename from lib/show_mem.c -rename to mm/show_mem.c -index 1485c87be..de209c55d 100644 ---- a/lib/show_mem.c +diff --git a/mm/show_mem.c b/mm/show_mem.c +index 01f8e9905..94ebd86c8 100644 +--- a/mm/show_mem.c +++ b/mm/show_mem.c -@@ -7,11 +7,15 @@ - +@@ -12,10 +12,12 @@ + #include #include - #include + #include +#include -+ -+#include "slab.h" + #include + #include - void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) + #include "internal.h" ++#include "slab.h" + #include "swap.h" + + atomic_long_t _totalram_pages __read_mostly; +@@ -404,6 +406,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long total = 0, reserved = 0, highmem = 0; struct zone *zone; @@ -101838,7 +98576,7 @@ index 1485c87be..de209c55d 100644 printk("Mem-Info:\n"); __show_free_areas(filter, nodemask, max_zone_idx); -@@ -34,4 +38,37 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) +@@ -426,4 +429,23 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif @@ -101861,470 +98599,12 @@ index 1485c87be..de209c55d 100644 + + kfree(buf); + } -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ { -+ struct seq_buf s; -+ char *buf = kmalloc(4096, GFP_ATOMIC); -+ -+ if (buf) { -+ printk("Memory allocations:\n"); -+ seq_buf_init(&s, buf, 4096); -+ alloc_tags_show_mem_report(&s); -+ printk("%s", buf); -+ kfree(buf); -+ } -+ } -+#endif } -diff --git a/mm/slab.c b/mm/slab.c -index bb57f7fdb..d02d2dd27 100644 ---- a/mm/slab.c -+++ b/mm/slab.c -@@ -1232,7 +1232,7 @@ void __init kmem_cache_init(void) - create_boot_cache(kmem_cache, "kmem_cache", - offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *), -- SLAB_HWCACHE_ALIGN, 0, 0); -+ SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); - list_add(&kmem_cache->list, 
&slab_caches); - slab_state = PARTIAL; - -@@ -3367,9 +3367,11 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) - static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, - unsigned long caller) - { -+ struct slab *slab = virt_to_slab(objp); - bool init; - -- memcg_slab_free_hook(cachep, virt_to_slab(objp), &objp, 1); -+ memcg_slab_free_hook(cachep, slab, &objp, 1); -+ alloc_tagging_slab_free_hook(cachep, slab, &objp, 1); - - if (is_kfence_address(objp)) { - kmemleak_free_recursive(objp, cachep->flags); -@@ -3446,18 +3448,18 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, - return ret; - } - --void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) -+void *kmem_cache_alloc_noprof(struct kmem_cache *cachep, gfp_t flags) - { - return __kmem_cache_alloc_lru(cachep, NULL, flags); - } --EXPORT_SYMBOL(kmem_cache_alloc); -+EXPORT_SYMBOL(kmem_cache_alloc_noprof); - --void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, -+void *kmem_cache_alloc_lru_noprof(struct kmem_cache *cachep, struct list_lru *lru, - gfp_t flags) - { - return __kmem_cache_alloc_lru(cachep, lru, flags); - } --EXPORT_SYMBOL(kmem_cache_alloc_lru); -+EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); - - static __always_inline void - cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, -@@ -3469,8 +3471,8 @@ cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, - p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); - } - --int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, -- void **p) -+int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, -+ void **p) - { - struct obj_cgroup *objcg = NULL; - unsigned long irqflags; -@@ -3508,7 +3510,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - kmem_cache_free_bulk(s, i, p); - return 0; - } --EXPORT_SYMBOL(kmem_cache_alloc_bulk); -+EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); - - /** - * kmem_cache_alloc_node - Allocate an object on the specified node -@@ -3523,7 +3525,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); - * - * Return: pointer to the new object or %NULL in case of error - */ --void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) -+void *kmem_cache_alloc_node_noprof(struct kmem_cache *cachep, gfp_t flags, int nodeid) - { - void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); - -@@ -3531,7 +3533,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) - - return ret; - } --EXPORT_SYMBOL(kmem_cache_alloc_node); -+EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); - - void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, - int nodeid, size_t orig_size, diff --git a/mm/slab.h b/mm/slab.h -index f01ac256a..bc2d3429d 100644 +index 9c0e09d0f..7bcf32b47 100644 --- a/mm/slab.h +++ b/mm/slab.h -@@ -57,8 +57,8 @@ struct slab { - #endif - - atomic_t __page_refcount; --#ifdef CONFIG_MEMCG -- unsigned long memcg_data; -+#ifdef CONFIG_SLAB_OBJ_EXT -+ unsigned long obj_exts; - #endif - }; - -@@ -67,8 +67,8 @@ struct slab { - SLAB_MATCH(flags, __page_flags); - SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ - SLAB_MATCH(_refcount, __page_refcount); --#ifdef CONFIG_MEMCG --SLAB_MATCH(memcg_data, memcg_data); -+#ifdef CONFIG_SLAB_OBJ_EXT -+SLAB_MATCH(memcg_data, obj_exts); - #endif - #undef SLAB_MATCH - static_assert(sizeof(struct slab) <= sizeof(struct 
page)); -@@ -390,36 +390,198 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla - return false; - } - --#ifdef CONFIG_MEMCG_KMEM -+#ifdef CONFIG_SLAB_OBJ_EXT -+ - /* -- * slab_objcgs - get the object cgroups vector associated with a slab -+ * slab_obj_exts - get the pointer to the slab object extension vector -+ * associated with a slab. - * @slab: a pointer to the slab struct - * -- * Returns a pointer to the object cgroups vector associated with the slab, -+ * Returns a pointer to the object extension vector associated with the slab, - * or NULL if no such vector has been associated yet. - */ --static inline struct obj_cgroup **slab_objcgs(struct slab *slab) -+static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) - { -- unsigned long memcg_data = READ_ONCE(slab->memcg_data); -+ unsigned long obj_exts = READ_ONCE(slab->obj_exts); - -- VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), -+#ifdef CONFIG_MEMCG -+ VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS), - slab_page(slab)); -- VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab)); -+ VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab)); - -- return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+#endif -+ return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK); - } - --int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, -- gfp_t gfp, bool new_slab); --void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, -- enum node_stat_item idx, int nr); -+int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, -+ gfp_t gfp, bool new_slab); -+ -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+ -+static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) -+{ -+ struct slabobj_ext *slab_exts; -+ struct slab *obj_exts_slab; -+ -+ obj_exts_slab = virt_to_slab(obj_exts); -+ slab_exts = slab_obj_exts(obj_exts_slab); -+ if (slab_exts) { -+ unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, -+ obj_exts_slab, obj_exts); -+ /* codetag should be NULL */ -+ WARN_ON(slab_exts[offs].ref.ct); -+ set_codetag_empty(&slab_exts[offs].ref); -+ } -+} -+ -+static inline void mark_failed_objexts_alloc(struct slab *slab) -+{ -+ slab->obj_exts = OBJEXTS_ALLOC_FAIL; -+} -+ -+static inline void handle_failed_objexts_alloc(unsigned long obj_exts, -+ struct slabobj_ext *vec, unsigned int objects) -+{ -+ /* -+ * If vector previously failed to allocate then we have live -+ * objects with no tag reference. Mark all references in this -+ * vector as empty to avoid warnings later on. -+ */ -+ if (obj_exts & OBJEXTS_ALLOC_FAIL) { -+ unsigned int i; -+ -+ for (i = 0; i < objects; i++) -+ set_codetag_empty(&vec[i].ref); -+ } -+} -+ -+ -+#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -+ -+static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {} -+static inline void mark_failed_objexts_alloc(struct slab *slab) {} -+static inline void handle_failed_objexts_alloc(unsigned long obj_exts, -+ struct slabobj_ext *vec, unsigned int objects) {} -+ -+#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -+ -+static inline bool need_slab_obj_ext(void) -+{ -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ if (mem_alloc_profiling_enabled()) -+ return true; -+#endif -+ /* -+ * CONFIG_MEMCG_KMEM creates vector of obj_cgroup objects conditionally -+ * inside memcg_slab_post_alloc_hook. No other users for now. 
-+ */ -+ return false; -+} -+ -+static inline void free_slab_obj_exts(struct slab *slab) -+{ -+ struct slabobj_ext *obj_exts; -+ -+ obj_exts = slab_obj_exts(slab); -+ if (!obj_exts) -+ return; -+ -+ /* -+ * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its -+ * corresponding extension will be NULL. alloc_tag_sub() will throw a -+ * warning if slab has extensions but the extension of an object is -+ * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that -+ * the extension for obj_exts is expected to be NULL. -+ */ -+ mark_objexts_empty(obj_exts); -+ kfree(obj_exts); -+ slab->obj_exts = 0; -+} -+ -+static inline struct slabobj_ext * -+prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) -+{ -+ struct slab *slab; -+ -+ if (!p) -+ return NULL; -+ -+ if (!need_slab_obj_ext()) -+ return NULL; -+ -+ if (s->flags & SLAB_NO_OBJ_EXT) -+ return NULL; - --static inline void memcg_free_slab_cgroups(struct slab *slab) -+ if (flags & __GFP_NO_OBJ_EXT) -+ return NULL; -+ -+ slab = virt_to_slab(p); -+ if (!slab_obj_exts(slab) && -+ WARN(alloc_slab_obj_exts(slab, s, flags, false), -+ "%s, %s: Failed to create slab extension vector!\n", -+ __func__, s->name)) -+ return NULL; -+ -+ return slab_obj_exts(slab) + obj_to_index(s, slab, p); -+} -+ -+#else /* CONFIG_SLAB_OBJ_EXT */ -+ -+static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) - { -- kfree(slab_objcgs(slab)); -- slab->memcg_data = 0; -+ return NULL; -+} -+ -+static inline int alloc_slab_obj_exts(struct slab *slab, -+ struct kmem_cache *s, gfp_t gfp, -+ bool new_slab) -+{ -+ return 0; -+} -+ -+static inline void free_slab_obj_exts(struct slab *slab) -+{ -+} -+ -+static inline struct slabobj_ext * -+prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) -+{ -+ return NULL; -+} -+ -+#endif /* CONFIG_SLAB_OBJ_EXT */ -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ -+static inline void alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, -+ void **p, int objects) -+{ -+ struct slabobj_ext *obj_exts; -+ int i; -+ -+ obj_exts = slab_obj_exts(slab); -+ if (!obj_exts) -+ return; -+ -+ for (i = 0; i < objects; i++) { -+ unsigned int off = obj_to_index(s, slab, p[i]); -+ -+ alloc_tag_sub(&obj_exts[off].ref, s->size); -+ } - } - -+#else -+ -+static inline void alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, -+ void **p, int objects) {} -+ -+#endif /* CONFIG_MEM_ALLOC_PROFILING */ -+ -+#ifdef CONFIG_MEMCG_KMEM -+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, -+ enum node_stat_item idx, int nr); -+ - static inline size_t obj_full_size(struct kmem_cache *s) - { - /* -@@ -487,16 +649,15 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, - if (likely(p[i])) { - slab = virt_to_slab(p[i]); - -- if (!slab_objcgs(slab) && -- memcg_alloc_slab_cgroups(slab, s, flags, -- false)) { -+ if (!slab_obj_exts(slab) && -+ alloc_slab_obj_exts(slab, s, flags, false)) { - obj_cgroup_uncharge(objcg, obj_full_size(s)); - continue; - } - - off = obj_to_index(s, slab, p[i]); - obj_cgroup_get(objcg); -- slab_objcgs(slab)[off] = objcg; -+ slab_obj_exts(slab)[off].objcg = objcg; - mod_objcg_state(objcg, slab_pgdat(slab), - cache_vmstat_idx(s), obj_full_size(s)); - } else { -@@ -509,14 +670,14 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, - static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects) - { -- struct obj_cgroup **objcgs; -+ struct slabobj_ext *obj_exts; - int i; - 
- if (!memcg_kmem_online()) - return; - -- objcgs = slab_objcgs(slab); -- if (!objcgs) -+ obj_exts = slab_obj_exts(slab); -+ if (!obj_exts) - return; - - for (i = 0; i < objects; i++) { -@@ -524,11 +685,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - unsigned int off; - - off = obj_to_index(s, slab, p[i]); -- objcg = objcgs[off]; -+ objcg = obj_exts[off].objcg; - if (!objcg) - continue; - -- objcgs[off] = NULL; -+ obj_exts[off].objcg = NULL; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), - -obj_full_size(s)); -@@ -537,27 +698,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - } - - #else /* CONFIG_MEMCG_KMEM */ --static inline struct obj_cgroup **slab_objcgs(struct slab *slab) --{ -- return NULL; --} -- - static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) - { - return NULL; - } - --static inline int memcg_alloc_slab_cgroups(struct slab *slab, -- struct kmem_cache *s, gfp_t gfp, -- bool new_slab) --{ -- return 0; --} -- --static inline void memcg_free_slab_cgroups(struct slab *slab) --{ --} -- - static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, -@@ -594,7 +739,7 @@ static __always_inline void account_slab(struct slab *slab, int order, - struct kmem_cache *s, gfp_t gfp) - { - if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) -- memcg_alloc_slab_cgroups(slab, s, gfp, true); -+ alloc_slab_obj_exts(slab, s, gfp, true); - - mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), - PAGE_SIZE << order); -@@ -603,8 +748,7 @@ static __always_inline void account_slab(struct slab *slab, int order, - static __always_inline void unaccount_slab(struct slab *slab, int order, - struct kmem_cache *s) - { -- if (memcg_kmem_online()) -- memcg_free_slab_cgroups(slab); -+ free_slab_obj_exts(slab); - - mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), - -(PAGE_SIZE << order)); -@@ -684,6 +828,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, - unsigned int orig_size) - { - unsigned int zero_size = s->object_size; -+ struct slabobj_ext *obj_exts; - size_t i; - - flags &= gfp_allowed_mask; -@@ -714,6 +859,13 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, - kmemleak_alloc_recursive(p[i], s->object_size, 1, - s->flags, flags); - kmsan_slab_alloc(s, p[i], flags); -+ obj_exts = prepare_slab_obj_exts_hook(s, flags, p[i]); -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ /* obj_exts can be allocated for other reasons */ -+ if (likely(obj_exts) && mem_alloc_profiling_enabled()) -+ alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); -+#endif - } - - memcg_slab_post_alloc_hook(s, objcg, flags, size, p); -@@ -766,10 +918,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +@@ -817,10 +817,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) if ((__n = get_node(__s, __node))) @@ -102340,10 +98620,10 @@ index f01ac256a..bc2d3429d 100644 } #endif diff --git a/mm/slab_common.c b/mm/slab_common.c -index 607249785..5b204e16f 100644 +index d1555ea29..fbd6b879d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c -@@ -24,6 +24,7 @@ +@@ -26,6 +26,7 @@ #include #include #include @@ -102351,173 +98631,7 @@ index 607249785..5b204e16f 100644 #include #include "internal.h" -@@ -204,6 +205,64 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, - return NULL; - } - -+#ifdef 
CONFIG_SLAB_OBJ_EXT -+/* -+ * The allocated objcg pointers array is not accounted directly. -+ * Moreover, it should not come from DMA buffer and is not readily -+ * reclaimable. So those GFP bits should be masked off. -+ */ -+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) -+ -+int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, -+ gfp_t gfp, bool new_slab) -+{ -+ unsigned int objects = objs_per_slab(s, slab); -+ unsigned long new_exts; -+ unsigned long old_exts; -+ struct slabobj_ext *vec; -+ -+ gfp &= ~OBJCGS_CLEAR_MASK; -+ /* Prevent recursive extension vector allocation */ -+ gfp |= __GFP_NO_OBJ_EXT; -+ vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, -+ slab_nid(slab)); -+ if (!vec) { -+ /* Mark vectors which failed to allocate */ -+ if (new_slab) -+ mark_failed_objexts_alloc(slab); -+ -+ return -ENOMEM; -+ } -+ -+ new_exts = (unsigned long)vec; -+#ifdef CONFIG_MEMCG -+ new_exts |= MEMCG_DATA_OBJEXTS; -+#endif -+ old_exts = slab->obj_exts; -+ handle_failed_objexts_alloc(old_exts, vec, objects); -+ if (new_slab) { -+ /* -+ * If the slab is brand new and nobody can yet access its -+ * obj_exts, no synchronization is required and obj_exts can -+ * be simply assigned. -+ */ -+ slab->obj_exts = new_exts; -+ } else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) { -+ /* -+ * If the slab is already in use, somebody can allocate and -+ * assign slabobj_exts in parallel. In this case the existing -+ * objcg vector should be reused. -+ */ -+ mark_objexts_empty(vec); -+ kfree(vec); -+ return 0; -+ } -+ -+ kmemleak_not_leak(vec); -+ return 0; -+} -+#endif /* CONFIG_SLAB_OBJ_EXT */ -+ - static struct kmem_cache *create_cache(const char *name, - unsigned int object_size, unsigned int align, - slab_flags_t flags, unsigned int useroffset, -@@ -968,24 +1027,24 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller - return ret; - } - --void *__kmalloc_node(size_t size, gfp_t flags, int node) -+void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) - { - return __do_kmalloc_node(size, flags, node, _RET_IP_); - } --EXPORT_SYMBOL(__kmalloc_node); -+EXPORT_SYMBOL(__kmalloc_node_noprof); - --void *__kmalloc(size_t size, gfp_t flags) -+void *__kmalloc_noprof(size_t size, gfp_t flags) - { - return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); - } --EXPORT_SYMBOL(__kmalloc); -+EXPORT_SYMBOL(__kmalloc_noprof); - --void *__kmalloc_node_track_caller(size_t size, gfp_t flags, -- int node, unsigned long caller) -+void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, -+ int node, unsigned long caller) - { - return __do_kmalloc_node(size, flags, node, caller); - } --EXPORT_SYMBOL(__kmalloc_node_track_caller); -+EXPORT_SYMBOL(kmalloc_node_track_caller_noprof); - - /** - * kfree - free previously allocated memory -@@ -1052,7 +1111,7 @@ size_t __ksize(const void *object) - return slab_ksize(folio_slab(folio)->slab_cache); - } - --void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) -+void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) - { - void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, - size, _RET_IP_); -@@ -1062,9 +1121,9 @@ void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; - } --EXPORT_SYMBOL(kmalloc_trace); -+EXPORT_SYMBOL(kmalloc_trace_noprof); - --void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, -+void *kmalloc_node_trace_noprof(struct 
kmem_cache *s, gfp_t gfpflags, - int node, size_t size) - { - void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); -@@ -1074,7 +1133,7 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; - } --EXPORT_SYMBOL(kmalloc_node_trace); -+EXPORT_SYMBOL(kmalloc_node_trace_noprof); - - gfp_t kmalloc_fix_flags(gfp_t flags) - { -@@ -1104,7 +1163,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) - flags = kmalloc_fix_flags(flags); - - flags |= __GFP_COMP; -- page = alloc_pages_node(node, flags, order); -+ page = alloc_pages_node_noprof(node, flags, order); - if (page) { - ptr = page_address(page); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -@@ -1119,7 +1178,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) - return ptr; - } - --void *kmalloc_large(size_t size, gfp_t flags) -+void *kmalloc_large_noprof(size_t size, gfp_t flags) - { - void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); - -@@ -1127,9 +1186,9 @@ void *kmalloc_large(size_t size, gfp_t flags) - flags, NUMA_NO_NODE); - return ret; - } --EXPORT_SYMBOL(kmalloc_large); -+EXPORT_SYMBOL(kmalloc_large_noprof); - --void *kmalloc_large_node(size_t size, gfp_t flags, int node) -+void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) - { - void *ret = __kmalloc_large_node(size, flags, node); - -@@ -1137,7 +1196,7 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) - flags, node); - return ret; - } --EXPORT_SYMBOL(kmalloc_large_node); -+EXPORT_SYMBOL(kmalloc_large_node_noprof); - - #ifdef CONFIG_SLAB_FREELIST_RANDOM - /* Randomize a generic freelist */ -@@ -1259,10 +1318,15 @@ static int slab_show(struct seq_file *m, void *p) +@@ -1273,10 +1274,15 @@ static int slab_show(struct seq_file *m, void *p) return 0; } @@ -102534,7 +98648,7 @@ index 607249785..5b204e16f 100644 /* * Here acquiring slab_mutex is risky since we don't prefer to get -@@ -1272,24 +1336,52 @@ void dump_unreclaimable_slab(void) +@@ -1286,24 +1292,52 @@ void dump_unreclaimable_slab(void) * without acquiring the mutex. */ if (!mutex_trylock(&slab_mutex)) { @@ -102582,7 +98696,7 @@ index 607249785..5b204e16f 100644 + } + + slabs_by_mem[i] = n; - } ++ } + + for (i = nr - 1; i >= 0; --i) { + seq_buf_printf(out, "%-17s total: ", slabs_by_mem[i].s->name); @@ -102590,567 +98704,24 @@ index 607249785..5b204e16f 100644 + seq_buf_printf(out, " active: "); + seq_buf_human_readable_u64(out, slabs_by_mem[i].active); + seq_buf_putc(out, '\n'); -+ } + } + mutex_unlock(&slab_mutex); } -@@ -1356,7 +1448,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) - return (void *)p; - } - -- ret = kmalloc_track_caller(new_size, flags); -+ ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); - if (ret && p) { - /* Disable KASAN checks as the object's redzone is accessed. 
*/ - kasan_disable_current(); -@@ -1380,7 +1472,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) - * - * Return: pointer to the allocated memory or %NULL in case of error - */ --void *krealloc(const void *p, size_t new_size, gfp_t flags) -+void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) - { - void *ret; - -@@ -1395,7 +1487,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) - - return ret; - } --EXPORT_SYMBOL(krealloc); -+EXPORT_SYMBOL(krealloc_noprof); - - /** - * kfree_sensitive - Clear sensitive information in memory before freeing -diff --git a/mm/slub.c b/mm/slub.c -index c87628cd8..768b0e292 100644 ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -1781,7 +1781,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, - return kasan_slab_free(s, x, init); - } - --static inline bool slab_free_freelist_hook(struct kmem_cache *s, -+static __always_inline bool slab_free_freelist_hook(struct kmem_cache *s, - void **head, void **tail, - int *cnt) - { -@@ -3470,18 +3470,18 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, - return ret; - } - --void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) -+void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags) - { - return __kmem_cache_alloc_lru(s, NULL, gfpflags); - } --EXPORT_SYMBOL(kmem_cache_alloc); -+EXPORT_SYMBOL(kmem_cache_alloc_noprof); - --void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, -+void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, - gfp_t gfpflags) - { - return __kmem_cache_alloc_lru(s, lru, gfpflags); - } --EXPORT_SYMBOL(kmem_cache_alloc_lru); -+EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); - - void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t orig_size, -@@ -3491,7 +3491,7 @@ void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, - caller, orig_size); - } - --void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) -+void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node) - { - void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); - -@@ -3499,7 +3499,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) - - return ret; - } --EXPORT_SYMBOL(kmem_cache_alloc_node); -+EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); - - static noinline void free_to_partial_list( - struct kmem_cache *s, struct slab *slab, -@@ -3779,6 +3779,7 @@ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, - unsigned long addr) - { - memcg_slab_free_hook(s, slab, p, cnt); -+ alloc_tagging_slab_free_hook(s, slab, p, cnt); - /* - * With KASAN enabled slab_free_freelist_hook modifies the freelist - * to remove objects, whose reuse must be delayed. -@@ -4009,8 +4010,8 @@ static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - #endif /* CONFIG_SLUB_TINY */ - - /* Note that interrupts must be enabled when calling this function. 
*/ --int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, -- void **p) -+int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, -+ void **p) - { - int i; - struct obj_cgroup *objcg = NULL; -@@ -4034,7 +4035,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - slab_want_init_on_alloc(flags, s), s->object_size); - return i; - } --EXPORT_SYMBOL(kmem_cache_alloc_bulk); -+EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); - - - /* -@@ -5020,7 +5021,8 @@ void __init kmem_cache_init(void) - node_set(node, slab_nodes); - - create_boot_cache(kmem_cache_node, "kmem_cache_node", -- sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); -+ sizeof(struct kmem_cache_node), -+ SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); - - hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); - -@@ -5030,7 +5032,7 @@ void __init kmem_cache_init(void) - create_boot_cache(kmem_cache, "kmem_cache", - offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *), -- SLAB_HWCACHE_ALIGN, 0, 0); -+ SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); - - kmem_cache = bootstrap(&boot_kmem_cache); - kmem_cache_node = bootstrap(&boot_kmem_cache_node); -diff --git a/mm/util.c b/mm/util.c -index dd12b9531..9d24b8870 100644 ---- a/mm/util.c -+++ b/mm/util.c -@@ -115,7 +115,7 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp) - EXPORT_SYMBOL(kstrndup); - - /** -- * kmemdup - duplicate region of memory -+ * kmemdup_noprof - duplicate region of memory - * - * @src: memory region to duplicate - * @len: memory region length -@@ -124,16 +124,16 @@ EXPORT_SYMBOL(kstrndup); - * Return: newly allocated copy of @src or %NULL in case of error, - * result is physically contiguous. Use kfree() to free. - */ --void *kmemdup(const void *src, size_t len, gfp_t gfp) -+void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) - { - void *p; - -- p = kmalloc_track_caller(len, gfp); -+ p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_); - if (p) - memcpy(p, src, len); - return p; - } --EXPORT_SYMBOL(kmemdup); -+EXPORT_SYMBOL(kmemdup_noprof); - - /** - * kvmemdup - duplicate region of memory -@@ -564,7 +564,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, - EXPORT_SYMBOL(vm_mmap); - - /** -- * kvmalloc_node - attempt to allocate physically contiguous memory, but upon -+ * kvmalloc_node_noprof - attempt to allocate physically contiguous memory, but upon - * failure, fall back to non-contiguous (vmalloc) allocation. - * @size: size of the request. - * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. -@@ -579,7 +579,7 @@ EXPORT_SYMBOL(vm_mmap); - * - * Return: pointer to the allocated memory of %NULL in case of failure - */ --void *kvmalloc_node(size_t size, gfp_t flags, int node) -+void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) - { - gfp_t kmalloc_flags = flags; - void *ret; -@@ -601,7 +601,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) - kmalloc_flags &= ~__GFP_NOFAIL; - } - -- ret = kmalloc_node(size, kmalloc_flags, node); -+ ret = kmalloc_node_noprof(size, kmalloc_flags, node); - - /* - * It doesn't really make sense to fallback to vmalloc for sub page -@@ -626,11 +626,11 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) - * about the resulting pointer, and cannot play - * protection games. 
- */ -- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, -+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, - flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, - node, __builtin_return_address(0)); - } --EXPORT_SYMBOL(kvmalloc_node); -+EXPORT_SYMBOL(kvmalloc_node_noprof); - - /** - * kvfree() - Free memory. -@@ -669,7 +669,7 @@ void kvfree_sensitive(const void *addr, size_t len) - } - EXPORT_SYMBOL(kvfree_sensitive); - --void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) -+void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - { - void *newp; - -@@ -682,15 +682,15 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - kvfree(p); - return newp; - } --EXPORT_SYMBOL(kvrealloc); -+EXPORT_SYMBOL(kvrealloc_noprof); - - /** -- * __vmalloc_array - allocate memory for a virtually contiguous array. -+ * __vmalloc_array_noprof - allocate memory for a virtually contiguous array. - * @n: number of elements. - * @size: element size. - * @flags: the type of memory to allocate (see kmalloc). - */ --void *__vmalloc_array(size_t n, size_t size, gfp_t flags) -+void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) - { - size_t bytes; - -@@ -698,18 +698,18 @@ void *__vmalloc_array(size_t n, size_t size, gfp_t flags) - return NULL; - return __vmalloc(bytes, flags); - } --EXPORT_SYMBOL(__vmalloc_array); -+EXPORT_SYMBOL(__vmalloc_array_noprof); - - /** -- * vmalloc_array - allocate memory for a virtually contiguous array. -+ * vmalloc_array_noprof - allocate memory for a virtually contiguous array. - * @n: number of elements. - * @size: element size. - */ --void *vmalloc_array(size_t n, size_t size) -+void *vmalloc_array_noprof(size_t n, size_t size) - { - return __vmalloc_array(n, size, GFP_KERNEL); - } --EXPORT_SYMBOL(vmalloc_array); -+EXPORT_SYMBOL(vmalloc_array_noprof); - - /** - * __vcalloc - allocate and zero memory for a virtually contiguous array. -@@ -717,22 +717,22 @@ EXPORT_SYMBOL(vmalloc_array); - * @size: element size. - * @flags: the type of memory to allocate (see kmalloc). - */ --void *__vcalloc(size_t n, size_t size, gfp_t flags) -+void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) - { - return __vmalloc_array(n, size, flags | __GFP_ZERO); - } --EXPORT_SYMBOL(__vcalloc); -+EXPORT_SYMBOL(__vcalloc_noprof); - - /** -- * vcalloc - allocate and zero memory for a virtually contiguous array. -+ * vcalloc_noprof - allocate and zero memory for a virtually contiguous array. - * @n: number of elements. - * @size: element size. - */ --void *vcalloc(size_t n, size_t size) -+void *vcalloc_noprof(size_t n, size_t size) - { - return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO); - } --EXPORT_SYMBOL(vcalloc); -+EXPORT_SYMBOL(vcalloc_noprof); - - /* Neutral page->mapping pointer to address_space or anon_vma or other */ - void *page_rmapping(struct page *page) -diff --git a/mm/vmalloc.c b/mm/vmalloc.c -index 1d13d7168..4c199cf9b 100644 ---- a/mm/vmalloc.c -+++ b/mm/vmalloc.c -@@ -2971,12 +2971,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid, - * but mempolicy wants to alloc memory by interleaving. 
- */ - if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) -- nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, -+ nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp, - nr_pages_request, - pages + nr_allocated); - - else -- nr = alloc_pages_bulk_array_node(bulk_gfp, nid, -+ nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid, - nr_pages_request, - pages + nr_allocated); - -@@ -3006,9 +3006,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid, - break; - - if (nid == NUMA_NO_NODE) -- page = alloc_pages(alloc_gfp, order); -+ page = alloc_pages_noprof(alloc_gfp, order); - else -- page = alloc_pages_node(nid, alloc_gfp, order); -+ page = alloc_pages_node_noprof(nid, alloc_gfp, order); - if (unlikely(!page)) { - if (!nofail) - break; -@@ -3065,10 +3065,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - - /* Please note that the recursion is strictly bounded. */ - if (array_size > PAGE_SIZE) { -- area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, -+ area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node, - area->caller); - } else { -- area->pages = kmalloc_node(array_size, nested_gfp, node); -+ area->pages = kmalloc_node_noprof(array_size, nested_gfp, node); - } - - if (!area->pages) { -@@ -3151,7 +3151,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - } - - /** -- * __vmalloc_node_range - allocate virtually contiguous memory -+ * __vmalloc_node_range_noprof - allocate virtually contiguous memory - * @size: allocation size - * @align: desired alignment - * @start: vm area range start -@@ -3178,7 +3178,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - * - * Return: the address of the area or %NULL on failure - */ --void *__vmalloc_node_range(unsigned long size, unsigned long align, -+void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, - unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, unsigned long vm_flags, int node, - const void *caller) -@@ -3307,7 +3307,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, - } - - /** -- * __vmalloc_node - allocate virtually contiguous memory -+ * __vmalloc_node_noprof - allocate virtually contiguous memory - * @size: allocation size - * @align: desired alignment - * @gfp_mask: flags for the page level allocator -@@ -3325,10 +3325,10 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *__vmalloc_node(unsigned long size, unsigned long align, -+void *__vmalloc_node_noprof(unsigned long size, unsigned long align, - gfp_t gfp_mask, int node, const void *caller) - { -- return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, -+ return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, - gfp_mask, PAGE_KERNEL, 0, node, caller); - } - /* -@@ -3337,15 +3337,15 @@ void *__vmalloc_node(unsigned long size, unsigned long align, - * than that. 
- */ - #ifdef CONFIG_TEST_VMALLOC_MODULE --EXPORT_SYMBOL_GPL(__vmalloc_node); -+EXPORT_SYMBOL_GPL(__vmalloc_node_noprof); - #endif - --void *__vmalloc(unsigned long size, gfp_t gfp_mask) -+void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) - { -- return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, -+ return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(__vmalloc); -+EXPORT_SYMBOL(__vmalloc_noprof); - - /** - * vmalloc - allocate virtually contiguous memory -@@ -3359,12 +3359,12 @@ EXPORT_SYMBOL(__vmalloc); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vmalloc(unsigned long size) -+void *vmalloc_noprof(unsigned long size) - { -- return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, -+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vmalloc); -+EXPORT_SYMBOL(vmalloc_noprof); - - /** - * vmalloc_huge - allocate virtually contiguous memory, allow huge pages -@@ -3378,16 +3378,16 @@ EXPORT_SYMBOL(vmalloc); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) -+void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) - { -- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, -+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, - gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, - NUMA_NO_NODE, __builtin_return_address(0)); - } --EXPORT_SYMBOL_GPL(vmalloc_huge); -+EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); - - /** -- * vzalloc - allocate virtually contiguous memory with zero fill -+ * vzalloc_noprof - allocate virtually contiguous memory with zero fill - * @size: allocation size - * - * Allocate enough pages to cover @size from the page level -@@ -3399,12 +3399,12 @@ EXPORT_SYMBOL_GPL(vmalloc_huge); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vzalloc(unsigned long size) -+void *vzalloc_noprof(unsigned long size) - { -- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, -+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vzalloc); -+EXPORT_SYMBOL(vzalloc_noprof); - - /** - * vmalloc_user - allocate zeroed virtually contiguous memory for userspace -@@ -3415,17 +3415,17 @@ EXPORT_SYMBOL(vzalloc); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vmalloc_user(unsigned long size) -+void *vmalloc_user_noprof(unsigned long size) - { -- return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, -+ return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, - VM_USERMAP, NUMA_NO_NODE, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vmalloc_user); -+EXPORT_SYMBOL(vmalloc_user_noprof); - - /** -- * vmalloc_node - allocate memory on a specific node -+ * vmalloc_node_noprof - allocate memory on a specific node - * @size: allocation size - * @node: numa node - * -@@ -3437,15 +3437,15 @@ EXPORT_SYMBOL(vmalloc_user); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vmalloc_node(unsigned long size, int node) -+void *vmalloc_node_noprof(unsigned long size, int node) - { -- return __vmalloc_node(size, 1, GFP_KERNEL, node, -+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vmalloc_node); 
-+EXPORT_SYMBOL(vmalloc_node_noprof); - - /** -- * vzalloc_node - allocate memory on a specific node with zero fill -+ * vzalloc_node_noprof - allocate memory on a specific node with zero fill - * @size: allocation size - * @node: numa node - * -@@ -3455,12 +3455,12 @@ EXPORT_SYMBOL(vmalloc_node); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vzalloc_node(unsigned long size, int node) -+void *vzalloc_node_noprof(unsigned long size, int node) - { -- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, -+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vzalloc_node); -+EXPORT_SYMBOL(vzalloc_node_noprof); - - #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) - #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) -@@ -3475,7 +3475,7 @@ EXPORT_SYMBOL(vzalloc_node); - #endif - - /** -- * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) -+ * vmalloc_32_noprof - allocate virtually contiguous memory (32bit addressable) - * @size: allocation size - * - * Allocate enough 32bit PA addressable pages to cover @size from the -@@ -3483,15 +3483,15 @@ EXPORT_SYMBOL(vzalloc_node); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vmalloc_32(unsigned long size) -+void *vmalloc_32_noprof(unsigned long size) - { -- return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, -+ return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vmalloc_32); -+EXPORT_SYMBOL(vmalloc_32_noprof); - - /** -- * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory -+ * vmalloc_32_user_noprof - allocate zeroed virtually contiguous 32bit memory - * @size: allocation size - * - * The resulting memory area is 32bit addressable and zeroed so it can be -@@ -3499,14 +3499,14 @@ EXPORT_SYMBOL(vmalloc_32); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vmalloc_32_user(unsigned long size) -+void *vmalloc_32_user_noprof(unsigned long size) - { -- return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, -+ return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, - GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, - VM_USERMAP, NUMA_NO_NODE, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vmalloc_32_user); -+EXPORT_SYMBOL(vmalloc_32_user_noprof); - - /* - * Atomically zero bytes in the iterator. diff --git a/mm/vmscan.c b/mm/vmscan.c -index d6802821d..a22f36ec7 100644 +index 445ce9324..19067fa9a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -58,6 +58,7 @@ +@@ -57,6 +57,7 @@ + #include #include #include - #include +#include #include #include -@@ -698,7 +699,6 @@ static int __prealloc_shrinker(struct shrinker *shrinker) +@@ -702,7 +703,6 @@ static int __prealloc_shrinker(struct shrinker *shrinker) return 0; } @@ -103158,7 +98729,7 @@ index d6802821d..a22f36ec7 100644 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) { va_list ap; -@@ -718,19 +718,12 @@ int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) +@@ -722,19 +722,12 @@ int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
return err; } @@ -103179,7 +98750,7 @@ index d6802821d..a22f36ec7 100644 if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); -@@ -761,7 +754,6 @@ static int __register_shrinker(struct shrinker *shrinker) +@@ -765,7 +758,6 @@ static int __register_shrinker(struct shrinker *shrinker) return 0; } @@ -103187,7 +98758,7 @@ index d6802821d..a22f36ec7 100644 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) { va_list ap; -@@ -780,12 +772,6 @@ int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) +@@ -784,12 +776,6 @@ int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) } return err; } @@ -103200,7 +98771,7 @@ index d6802821d..a22f36ec7 100644 EXPORT_SYMBOL(register_shrinker); /* -@@ -811,6 +797,9 @@ void unregister_shrinker(struct shrinker *shrinker) +@@ -815,6 +801,9 @@ void unregister_shrinker(struct shrinker *shrinker) kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; @@ -103210,7 +98781,7 @@ index d6802821d..a22f36ec7 100644 } EXPORT_SYMBOL(unregister_shrinker); -@@ -829,6 +818,80 @@ void synchronize_shrinkers(void) +@@ -833,6 +822,80 @@ void synchronize_shrinkers(void) } EXPORT_SYMBOL(synchronize_shrinkers); @@ -103291,7 +98862,7 @@ index d6802821d..a22f36ec7 100644 #define SHRINK_BATCH 128 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, -@@ -895,12 +958,16 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, +@@ -899,12 +962,16 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); @@ -103327,7 +98898,7 @@ index 7778cc97a..5341736f2 100644 +# eval_vars(X_,a/b/c) = $(X_a_b_c) $(X_a_b) $(X_a) +eval_vars = $(foreach var,$(call flatten_dirs,$(2)),$($(1)$(var))) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib -index 100a386fc..1f106c71e 100644 +index 68d0134bd..48ded392d 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -148,7 +148,7 @@ _cpp_flags = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(target-stem).lds) @@ -103340,10 +98911,10 @@ index 100a386fc..1f106c71e 100644 endif diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c -index 0d2db4117..7b7dbeb5b 100644 +index 653b92f6d..47978efe4 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c -@@ -203,6 +203,11 @@ static int symbol_in_range(const struct sym_entry *s, +@@ -204,6 +204,11 @@ static int symbol_in_range(const struct sym_entry *s, return 0; } @@ -103355,7 +98926,7 @@ index 0d2db4117..7b7dbeb5b 100644 static int symbol_valid(const struct sym_entry *s) { const char *name = sym_name(s); -@@ -210,6 +215,14 @@ static int symbol_valid(const struct sym_entry *s) +@@ -211,6 +216,14 @@ static int symbol_valid(const struct sym_entry *s) /* if --all-symbols is not specified, then symbols outside the text * and inittext sections are discarded */ if (!all_symbols) { @@ -103370,36 +98941,5 @@ index 0d2db4117..7b7dbeb5b 100644 if (symbol_in_range(s, text_ranges, ARRAY_SIZE(text_ranges)) == 0) return 0; -diff --git a/scripts/module.lds.S b/scripts/module.lds.S -index bf5bcf283..45c67a099 100644 ---- a/scripts/module.lds.S -+++ b/scripts/module.lds.S -@@ -9,6 +9,8 @@ - #define DISCARD_EH_FRAME *(.eh_frame) - #endif - -+#include -+ - SECTIONS { - /DISCARD/ : { - *(.discard) -@@ -47,12 +49,17 @@ SECTIONS { - .data : { - *(.data .data.[0-9a-zA-Z_]*) - *(.data..L*) -+ CODETAG_SECTIONS() - } - - .rodata : { - *(.rodata .rodata.[0-9a-zA-Z_]*) - *(.rodata..L*) - } -+#else -+ 
.data : {
+ CODETAG_SECTIONS()
+ }
 #endif
 }
 -- 
-2.41.0.159.g0bfa463d37
+2.42.0
diff --git a/scripts/source.sh b/scripts/source.sh
index 336ce64..ec6b297 100755
--- a/scripts/source.sh
+++ b/scripts/source.sh
@@ -2,7 +2,7 @@
 
 echo "Pika Kernel - Getting source"
 
-wget -nv https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/snapshot/linux-6.5-rc7.tar.gz
-tar -xf ./linux-6.5-rc7.tar.gz
+wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.5.tar.gz
+tar -xf ./linux-6.5.tar.gz
 
-cd linux-6.5-rc7
+cd linux-6.5