diff --git a/config b/config index 6c60e62..247adc5 100644 --- a/config +++ b/config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 6.5.0-rc7 Kernel Configuration +# Linux/x86 6.5.0 Kernel Configuration # CONFIG_CC_VERSION_TEXT="gcc (GCC) 13.2.1 20230730" CONFIG_CC_IS_GCC=y @@ -2894,7 +2894,6 @@ CONFIG_SCSI_DH_RDAC=m CONFIG_SCSI_DH_HP_SW=m CONFIG_SCSI_DH_EMC=m CONFIG_SCSI_DH_ALUA=m -CONFIG_VHBA=m # end of SCSI device support CONFIG_ATA=y diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch index 2903879..9565648 100644 --- a/patches/0001-cachy-all.patch +++ b/patches/0001-cachy-all.patch @@ -1,13 +1,2145 @@ -From a7ef8b1848b3d53522882d36ef91ba3a6fcc619c Mon Sep 17 00:00:00 2001 +From de38719bf3e0937c83054c911c5cf102eae632dd Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 20 Aug 2023 15:52:45 +0200 -Subject: [PATCH 1/6] amd-pref-core +Date: Mon, 28 Aug 2023 14:01:05 +0200 +Subject: [PATCH 1/7] amd-hdr + +Signed-off-by: Peter Jung +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h | 71 ++ + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 34 +- + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 100 +++ + .../amd/display/amdgpu_dm/amdgpu_dm_color.c | 805 ++++++++++++++++-- + .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 72 ++ + .../amd/display/amdgpu_dm/amdgpu_dm_plane.c | 224 ++++- + .../amd/display/dc/dcn10/dcn10_cm_common.c | 95 ++- + .../drm/amd/display/dc/dcn30/dcn30_hwseq.c | 37 + + .../drm/amd/display/dc/dcn30/dcn30_hwseq.h | 3 + + .../drm/amd/display/dc/dcn301/dcn301_init.c | 2 +- + .../gpu/drm/amd/display/include/fixed31_32.h | 12 + + drivers/gpu/drm/arm/malidp_crtc.c | 2 +- + drivers/gpu/drm/drm_atomic.c | 1 + + drivers/gpu/drm/drm_atomic_state_helper.c | 1 + + drivers/gpu/drm/drm_atomic_uapi.c | 43 +- + drivers/gpu/drm/drm_property.c | 49 ++ + include/drm/drm_mode_object.h | 2 +- + include/drm/drm_plane.h | 7 + + include/drm/drm_property.h | 6 + + include/uapi/drm/drm_mode.h | 8 + + 20 files changed, 1446 insertions(+), 128 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h +index 32fe05c810c6..84bf501b02f4 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h +@@ -343,6 +343,77 @@ struct amdgpu_mode_info { + int disp_priority; + const struct amdgpu_display_funcs *funcs; + const enum drm_plane_type *plane_type; ++ ++ /* Driver-private color mgmt props */ ++ ++ /* @plane_degamma_lut_property: Plane property to set a degamma LUT to ++ * convert input space before blending. ++ */ ++ struct drm_property *plane_degamma_lut_property; ++ /* @plane_degamma_lut_size_property: Plane property to define the max ++ * size of degamma LUT as supported by the driver (read-only). ++ */ ++ struct drm_property *plane_degamma_lut_size_property; ++ /** ++ * @plane_degamma_tf_property: Plane pre-defined transfer function to ++ * to go from scanout/encoded values to linear values. ++ */ ++ struct drm_property *plane_degamma_tf_property; ++ /** ++ * @plane_hdr_mult_property: ++ */ ++ struct drm_property *plane_hdr_mult_property; ++ ++ struct drm_property *plane_ctm_property; ++ /** ++ * @shaper_lut_property: Plane property to set pre-blending shaper LUT ++ * that converts color content before 3D LUT. ++ */ ++ struct drm_property *plane_shaper_lut_property; ++ /** ++ * @shaper_lut_size_property: Plane property for the size of ++ * pre-blending shaper LUT as supported by the driver (read-only). 
++ */ ++ struct drm_property *plane_shaper_lut_size_property; ++ /** ++ * @plane_shaper_tf_property: Plane property to set a predefined ++ * transfer function for pre-blending shaper (before applying 3D LUT) ++ * with or without LUT. ++ */ ++ struct drm_property *plane_shaper_tf_property; ++ /** ++ * @plane_lut3d_property: Plane property for gamma correction using a ++ * 3D LUT (pre-blending). ++ */ ++ struct drm_property *plane_lut3d_property; ++ /** ++ * @plane_degamma_lut_size_property: Plane property to define the max ++ * size of 3D LUT as supported by the driver (read-only). ++ */ ++ struct drm_property *plane_lut3d_size_property; ++ /** ++ * @plane_blend_lut_property: Plane property for output gamma before ++ * blending. Userspace set a blend LUT to convert colors after 3D LUT ++ * conversion. It works as a post-3D LUT 1D LUT, with shaper LUT, they ++ * are sandwiching 3D LUT with two 1D LUT. ++ */ ++ struct drm_property *plane_blend_lut_property; ++ /** ++ * @plane_blend_lut_size_property: Plane property to define the max ++ * size of blend LUT as supported by the driver (read-only). ++ */ ++ struct drm_property *plane_blend_lut_size_property; ++ /** ++ * @plane_blend_tf_property: Plane property to set a predefined ++ * transfer function for pre-blending blend (before applying 3D LUT) ++ * with or without LUT. ++ */ ++ struct drm_property *plane_blend_tf_property; ++ /* @regamma_tf_property: Transfer function for CRTC regamma ++ * (post-blending). Possible values are defined by `enum ++ * amdgpu_transfer_function`. ++ */ ++ struct drm_property *regamma_tf_property; + }; + + #define AMDGPU_MAX_BL_LEVEL 0xFF +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +index e5554a36e8c8..43ef0e5f97ae 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -3943,6 +3943,11 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) + return r; + } + ++#ifdef AMD_PRIVATE_COLOR ++ if (amdgpu_dm_create_color_properties(adev)) ++ return -ENOMEM; ++#endif ++ + r = amdgpu_dm_audio_init(adev); + if (r) { + dc_release_state(state->context); +@@ -4992,7 +4997,9 @@ static int fill_dc_plane_attributes(struct amdgpu_device *adev, + * Always set input transfer function, since plane state is refreshed + * every time. 
+ */ +- ret = amdgpu_dm_update_plane_color_mgmt(dm_crtc_state, dc_plane_state); ++ ret = amdgpu_dm_update_plane_color_mgmt(dm_crtc_state, ++ plane_state, ++ dc_plane_state); + if (ret) + return ret; + +@@ -8007,6 +8014,10 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, + bundle->surface_updates[planes_count].gamma = dc_plane->gamma_correction; + bundle->surface_updates[planes_count].in_transfer_func = dc_plane->in_transfer_func; + bundle->surface_updates[planes_count].gamut_remap_matrix = &dc_plane->gamut_remap_matrix; ++ bundle->surface_updates[planes_count].hdr_mult = dc_plane->hdr_mult; ++ bundle->surface_updates[planes_count].func_shaper = dc_plane->in_shaper_func; ++ bundle->surface_updates[planes_count].lut3d_func = dc_plane->lut3d_func; ++ bundle->surface_updates[planes_count].blend_tf = dc_plane->blend_tf; + } + + amdgpu_dm_plane_fill_dc_scaling_info(dm->adev, new_plane_state, +@@ -8215,6 +8226,10 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, + &acrtc_state->stream->csc_color_matrix; + bundle->stream_update.out_transfer_func = + acrtc_state->stream->out_transfer_func; ++ bundle->stream_update.lut3d_func = ++ (struct dc_3dlut *) acrtc_state->stream->lut3d_func; ++ bundle->stream_update.func_shaper = ++ (struct dc_transfer_func *) acrtc_state->stream->func_shaper; + } + + acrtc_state->stream->abm_level = acrtc_state->abm_level; +@@ -9405,6 +9420,7 @@ static int dm_update_crtc_state(struct amdgpu_display_manager *dm, + * when a modeset is needed, to ensure it gets reprogrammed. + */ + if (dm_new_crtc_state->base.color_mgmt_changed || ++ dm_old_crtc_state->regamma_tf != dm_new_crtc_state->regamma_tf || + drm_atomic_crtc_needs_modeset(new_crtc_state)) { + ret = amdgpu_dm_update_crtc_color_mgmt(dm_new_crtc_state); + if (ret) +@@ -9472,6 +9488,10 @@ static bool should_reset_plane(struct drm_atomic_state *state, + */ + for_each_oldnew_plane_in_state(state, other, old_other_state, new_other_state, i) { + struct amdgpu_framebuffer *old_afb, *new_afb; ++ struct dm_plane_state *dm_new_other_state, *dm_old_other_state; ++ ++ dm_new_other_state = to_dm_plane_state(new_other_state); ++ dm_old_other_state = to_dm_plane_state(old_other_state); + + if (other->type == DRM_PLANE_TYPE_CURSOR) + continue; +@@ -9508,6 +9528,18 @@ static bool should_reset_plane(struct drm_atomic_state *state, + old_other_state->color_encoding != new_other_state->color_encoding) + return true; + ++ /* HDR/Transfer Function changes. */ ++ if (dm_old_other_state->degamma_tf != dm_new_other_state->degamma_tf || ++ dm_old_other_state->degamma_lut != dm_new_other_state->degamma_lut || ++ dm_old_other_state->hdr_mult != dm_new_other_state->hdr_mult || ++ dm_old_other_state->ctm != dm_new_other_state->ctm || ++ dm_old_other_state->shaper_lut != dm_new_other_state->shaper_lut || ++ dm_old_other_state->shaper_tf != dm_new_other_state->shaper_tf || ++ dm_old_other_state->lut3d != dm_new_other_state->lut3d || ++ dm_old_other_state->blend_lut != dm_new_other_state->blend_lut || ++ dm_old_other_state->blend_tf != dm_new_other_state->blend_tf) ++ return true; ++ + /* Framebuffer checks fall at the end. 
*/ + if (!old_other_state->fb || !new_other_state->fb) + continue; +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +index 9fb5bb3a75a7..f92bbd7ed867 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +@@ -51,6 +51,8 @@ + + #define AMDGPU_DMUB_NOTIFICATION_MAX 5 + ++#define AMDGPU_HDR_MULT_DEFAULT (0x100000000LL) ++ + /* + #include "include/amdgpu_dal_power_if.h" + #include "amdgpu_dm_irq.h" +@@ -702,9 +704,91 @@ static inline void amdgpu_dm_set_mst_status(uint8_t *status, + + extern const struct amdgpu_ip_block_version dm_ip_block; + ++enum amdgpu_transfer_function { ++ AMDGPU_TRANSFER_FUNCTION_DEFAULT, ++ AMDGPU_TRANSFER_FUNCTION_SRGB_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_BT709_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_PQ_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_LINEAR, ++ AMDGPU_TRANSFER_FUNCTION_UNITY, ++ AMDGPU_TRANSFER_FUNCTION_GAMMA22_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_GAMMA24_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_GAMMA26_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_SRGB_INV_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_BT709_INV_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_PQ_INV_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_GAMMA22_INV_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_GAMMA24_INV_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_GAMMA26_INV_EOTF, ++ AMDGPU_TRANSFER_FUNCTION_COUNT ++}; ++ + struct dm_plane_state { + struct drm_plane_state base; + struct dc_plane_state *dc_state; ++ ++ /* Plane color mgmt */ ++ /** ++ * @degamma_lut: ++ * ++ * 1D LUT for mapping framebuffer/plane pixel data before sampling or ++ * blending operations. It's usually applied to linearize input space. ++ * The blob (if not NULL) is an array of &struct drm_color_lut. ++ */ ++ struct drm_property_blob *degamma_lut; ++ /** ++ * @degamma_tf: ++ * ++ * Predefined transfer function to tell DC driver the input space to ++ * linearize. ++ */ ++ enum amdgpu_transfer_function degamma_tf; ++ /** ++ * @hdr_mult: ++ * ++ * Multiplier to 'gain' the plane. When PQ is decoded using the fixed ++ * func transfer function to the internal FP16 fb, 1.0 -> 80 nits (on ++ * AMD at least). When sRGB is decoded, 1.0 -> 1.0, obviously. ++ * Therefore, 1.0 multiplier = 80 nits for SDR content. So if you ++ * want, 203 nits for SDR content, pass in (203.0 / 80.0). Format is ++ * S31.32 sign-magnitude. ++ */ ++ __u64 hdr_mult; ++ /** ++ * @ctm: ++ * ++ * Color transformation matrix. See drm_crtc_enable_color_mgmt(). The ++ * blob (if not NULL) is a &struct drm_color_ctm. ++ */ ++ struct drm_property_blob *ctm; ++ /** ++ * @shaper_lut: shaper lookup table blob. The blob (if not NULL) is an ++ * array of &struct drm_color_lut. ++ */ ++ struct drm_property_blob *shaper_lut; ++ /** ++ * @shaper_tf: ++ * ++ * Predefined transfer function to delinearize color space. ++ */ ++ enum amdgpu_transfer_function shaper_tf; ++ /** ++ * @lut3d: 3D lookup table blob. The blob (if not NULL) is an array of ++ * &struct drm_color_lut. ++ */ ++ struct drm_property_blob *lut3d; ++ /** ++ * @blend_lut: blend lut lookup table blob. The blob (if not NULL) is an ++ * array of &struct drm_color_lut. ++ */ ++ struct drm_property_blob *blend_lut; ++ /** ++ * @blend_tf: ++ * ++ * Pre-defined transfer function for converting plane pixel data before ++ * applying blend LUT. 
++ */ ++ enum amdgpu_transfer_function blend_tf; + }; + + struct dm_crtc_state { +@@ -729,6 +813,14 @@ struct dm_crtc_state { + struct dc_info_packet vrr_infopacket; + + int abm_level; ++ ++ /** ++ * @regamma_tf: ++ * ++ * Pre-defined transfer function for converting internal FB -> wire ++ * encoding. ++ */ ++ enum amdgpu_transfer_function regamma_tf; + }; + + #define to_dm_crtc_state(x) container_of(x, struct dm_crtc_state, base) +@@ -790,14 +882,22 @@ void amdgpu_dm_update_freesync_caps(struct drm_connector *connector, + + void amdgpu_dm_trigger_timing_sync(struct drm_device *dev); + ++/* 3D LUT max size is 17x17x17 */ ++#define MAX_COLOR_3DLUT_ENTRIES 4913 ++#define MAX_COLOR_3DLUT_BITDEPTH 12 ++int amdgpu_dm_verify_lut3d_size(struct amdgpu_device *adev, ++ struct drm_plane_state *plane_state); ++/* 1D LUT size */ + #define MAX_COLOR_LUT_ENTRIES 4096 + /* Legacy gamm LUT users such as X doesn't like large LUT sizes */ + #define MAX_COLOR_LEGACY_LUT_ENTRIES 256 + + void amdgpu_dm_init_color_mod(void); ++int amdgpu_dm_create_color_properties(struct amdgpu_device *adev); + int amdgpu_dm_verify_lut_sizes(const struct drm_crtc_state *crtc_state); + int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc); + int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc, ++ struct drm_plane_state *plane_state, + struct dc_plane_state *dc_plane_state); + + void amdgpu_dm_update_connector_after_detect( +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +index a4cb23d059bd..0a51af44efd5 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +@@ -72,6 +72,7 @@ + */ + + #define MAX_DRM_LUT_VALUE 0xFFFF ++#define SDR_WHITE_LEVEL_INIT_VALUE 80 + + /** + * amdgpu_dm_init_color_mod - Initialize the color module. +@@ -84,6 +85,213 @@ void amdgpu_dm_init_color_mod(void) + setup_x_points_distribution(); + } + ++#ifdef AMD_PRIVATE_COLOR ++/* Pre-defined Transfer Functions (TF) ++ * ++ * AMD driver supports pre-defined mathematical functions for transferring ++ * between encoded values and optical/linear space. Depending on HW color caps, ++ * ROMs and curves built by the AMD color module support these transforms. ++ * ++ * The driver-specific color implementation exposes properties for pre-blending ++ * degamma TF, shaper TF (before 3D LUT), and blend(dpp.ogam) TF and ++ * post-blending regamma (mpc.ogam) TF. However, only pre-blending degamma ++ * supports ROM curves. AMD color module uses pre-defined coefficients to build ++ * curves for the other blocks. What can be done by each color block is ++ * described by struct dpp_color_capsand struct mpc_color_caps. ++ * ++ * AMD driver-specific color API exposes the following pre-defined transfer ++ * functions: ++ * ++ * - Linear/Unity: linear/identity relationship between pixel value and ++ * luminance value; ++ * - Gamma 2.2, Gamma 2.4, Gamma 2.6: pure gamma functions; ++ * - sRGB: 2.4 gamma with small initial linear section as standardized by IEC ++ * 61966-2-1:1999; ++ * - BT.709 (BT.1886): 2.4 gamma with differences in the dark end of the scale. ++ * Used in HD-TV and standardized by ITU-R BT.1886; ++ * - PQ (Perceptual Quantizer): used for HDR display, allows luminance range ++ * capability of 0 to 10,000 nits; standardized by SMPTE ST 2084. 
++ * ++ * In the driver-specific API, color block names attached to TF properties ++ * suggest the intention regarding non-linear encoding pixel's luminance ++ * values. As some newer encodings don't use gamma curve, we make encoding and ++ * decoding explicit by defining an enum list of transfer functions supported ++ * in terms of EOTF and inverse EOTF, where: ++ * ++ * - EOTF (electro-optical transfer function): is the transfer function to go ++ * from the encoded value to an optical (linear) value. De-gamma functions ++ * traditionally do this. ++ * - Inverse EOTF (simply the inverse of the EOTF): is usually intended to go ++ * from an optical/linear space (which might have been used for blending) ++ * back to the encoded values. Gamma functions traditionally do this. ++ */ ++static const char * const ++amdgpu_transfer_function_names[] = { ++ [AMDGPU_TRANSFER_FUNCTION_DEFAULT] = "Default", ++ [AMDGPU_TRANSFER_FUNCTION_LINEAR] = "Linear", ++ [AMDGPU_TRANSFER_FUNCTION_UNITY] = "Unity", ++ [AMDGPU_TRANSFER_FUNCTION_SRGB_EOTF] = "sRGB EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_BT709_EOTF] = "BT.709 EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_PQ_EOTF] = "PQ EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_GAMMA22_EOTF] = "Gamma 2.2 EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_GAMMA24_EOTF] = "Gamma 2.4 EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_GAMMA26_EOTF] = "Gamma 2.6 EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_SRGB_INV_EOTF] = "sRGB inv_EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_BT709_INV_EOTF] = "BT.709 inv_EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_PQ_INV_EOTF] = "PQ inv_EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_GAMMA22_INV_EOTF] = "Gamma 2.2 inv_EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_GAMMA24_INV_EOTF] = "Gamma 2.4 inv_EOTF", ++ [AMDGPU_TRANSFER_FUNCTION_GAMMA26_INV_EOTF] = "Gamma 2.6 inv_EOTF", ++}; ++ ++static const u32 amdgpu_eotf = ++ BIT(AMDGPU_TRANSFER_FUNCTION_SRGB_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_BT709_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_PQ_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA22_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA24_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA26_EOTF); ++ ++static const u32 amdgpu_inv_eotf = ++ BIT(AMDGPU_TRANSFER_FUNCTION_SRGB_INV_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_BT709_INV_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_PQ_INV_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA22_INV_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA24_INV_EOTF) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_GAMMA26_INV_EOTF); ++ ++static struct drm_property * ++amdgpu_create_tf_property(struct drm_device *dev, ++ const char *name, ++ u32 supported_tf) ++{ ++ u32 transfer_functions = supported_tf | ++ BIT(AMDGPU_TRANSFER_FUNCTION_DEFAULT) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_LINEAR) | ++ BIT(AMDGPU_TRANSFER_FUNCTION_UNITY); ++ struct drm_prop_enum_list enum_list[AMDGPU_TRANSFER_FUNCTION_COUNT]; ++ int i, len; ++ ++ len = 0; ++ for (i = 0; i < AMDGPU_TRANSFER_FUNCTION_COUNT; i++) { ++ if ((transfer_functions & BIT(i)) == 0) ++ continue; ++ ++ enum_list[len].type = i; ++ enum_list[len].name = amdgpu_transfer_function_names[i]; ++ len++; ++ } ++ ++ return drm_property_create_enum(dev, DRM_MODE_PROP_ENUM, ++ name, enum_list, len); ++} ++ ++int ++amdgpu_dm_create_color_properties(struct amdgpu_device *adev) ++{ ++ struct drm_property *prop; ++ ++ prop = drm_property_create(adev_to_drm(adev), ++ DRM_MODE_PROP_BLOB, ++ "AMD_PLANE_DEGAMMA_LUT", 0); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_degamma_lut_property = prop; ++ ++ prop = drm_property_create_range(adev_to_drm(adev), ++ DRM_MODE_PROP_IMMUTABLE, ++ "AMD_PLANE_DEGAMMA_LUT_SIZE", 0, 
UINT_MAX); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_degamma_lut_size_property = prop; ++ ++ prop = amdgpu_create_tf_property(adev_to_drm(adev), ++ "AMD_PLANE_DEGAMMA_TF", ++ amdgpu_eotf); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_degamma_tf_property = prop; ++ ++ prop = drm_property_create_range(adev_to_drm(adev), ++ 0, "AMD_PLANE_HDR_MULT", 0, U64_MAX); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_hdr_mult_property = prop; ++ ++ prop = drm_property_create(adev_to_drm(adev), ++ DRM_MODE_PROP_BLOB, ++ "AMD_PLANE_CTM", 0); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_ctm_property = prop; ++ ++ prop = drm_property_create(adev_to_drm(adev), ++ DRM_MODE_PROP_BLOB, ++ "AMD_PLANE_SHAPER_LUT", 0); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_shaper_lut_property = prop; ++ ++ prop = drm_property_create_range(adev_to_drm(adev), ++ DRM_MODE_PROP_IMMUTABLE, ++ "AMD_PLANE_SHAPER_LUT_SIZE", 0, UINT_MAX); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_shaper_lut_size_property = prop; ++ ++ prop = amdgpu_create_tf_property(adev_to_drm(adev), ++ "AMD_PLANE_SHAPER_TF", ++ amdgpu_inv_eotf); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_shaper_tf_property = prop; ++ ++ prop = drm_property_create(adev_to_drm(adev), ++ DRM_MODE_PROP_BLOB, ++ "AMD_PLANE_LUT3D", 0); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_lut3d_property = prop; ++ ++ prop = drm_property_create_range(adev_to_drm(adev), ++ DRM_MODE_PROP_IMMUTABLE, ++ "AMD_PLANE_LUT3D_SIZE", 0, UINT_MAX); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_lut3d_size_property = prop; ++ ++ prop = drm_property_create(adev_to_drm(adev), ++ DRM_MODE_PROP_BLOB, ++ "AMD_PLANE_BLEND_LUT", 0); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_blend_lut_property = prop; ++ ++ prop = drm_property_create_range(adev_to_drm(adev), ++ DRM_MODE_PROP_IMMUTABLE, ++ "AMD_PLANE_BLEND_LUT_SIZE", 0, UINT_MAX); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_blend_lut_size_property = prop; ++ ++ prop = amdgpu_create_tf_property(adev_to_drm(adev), ++ "AMD_PLANE_BLEND_TF", ++ amdgpu_eotf); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.plane_blend_tf_property = prop; ++ ++ prop = amdgpu_create_tf_property(adev_to_drm(adev), ++ "AMD_CRTC_REGAMMA_TF", ++ amdgpu_inv_eotf); ++ if (!prop) ++ return -ENOMEM; ++ adev->mode_info.regamma_tf_property = prop; ++ ++ return 0; ++} ++#endif ++ + /** + * __extract_blob_lut - Extracts the DRM lut and lut size from a blob. + * @blob: DRM color mgmt property blob +@@ -182,7 +390,6 @@ static void __drm_lut_to_dc_gamma(const struct drm_color_lut *lut, + static void __drm_ctm_to_dc_matrix(const struct drm_color_ctm *ctm, + struct fixed31_32 *matrix) + { +- int64_t val; + int i; + + /* +@@ -201,12 +408,33 @@ static void __drm_ctm_to_dc_matrix(const struct drm_color_ctm *ctm, + } + + /* gamut_remap_matrix[i] = ctm[i - floor(i/4)] */ +- val = ctm->matrix[i - (i / 4)]; +- /* If negative, convert to 2's complement. */ +- if (val & (1ULL << 63)) +- val = -(val & ~(1ULL << 63)); ++ matrix[i] = dc_fixpt_from_s3132(ctm->matrix[i - (i / 4)]); ++ } ++} + +- matrix[i].value = val; ++/** ++ * __drm_ctm2_to_dc_matrix - converts a DRM CTM2 to a DC CSC float matrix ++ * @ctm: DRM color transformation matrix ++ * @matrix: DC CSC float matrix ++ * ++ * The matrix needs to be a 3x4 (12 entry) matrix. 
++ */ ++static void __drm_ctm2_to_dc_matrix(const struct drm_color_ctm2 *ctm, ++ struct fixed31_32 *matrix) ++{ ++ int i; ++ ++ /* ++ * DRM gives a 3x3 matrix, but DC wants 3x4. Assuming we're operating ++ * with homogeneous coordinates, augment the matrix with 0's. ++ * ++ * The format provided is S31.32, using signed-magnitude representation. ++ * Our fixed31_32 is also S31.32, but is using 2's complement. We have ++ * to convert from signed-magnitude to 2's complement. ++ */ ++ for (i = 0; i < 12; i++) { ++ /* gamut_remap_matrix[i] = ctm[i - floor(i/4)] */ ++ matrix[i] = dc_fixpt_from_s3132(ctm->matrix[i]); + } + } + +@@ -268,16 +496,18 @@ static int __set_output_tf(struct dc_transfer_func *func, + struct calculate_buffer cal_buffer = {0}; + bool res; + +- ASSERT(lut && lut_size == MAX_COLOR_LUT_ENTRIES); +- + cal_buffer.buffer_index = -1; + +- gamma = dc_create_gamma(); +- if (!gamma) +- return -ENOMEM; ++ if (lut_size) { ++ ASSERT(lut && lut_size == MAX_COLOR_LUT_ENTRIES); + +- gamma->num_entries = lut_size; +- __drm_lut_to_dc_gamma(lut, gamma, false); ++ gamma = dc_create_gamma(); ++ if (!gamma) ++ return -ENOMEM; ++ ++ gamma->num_entries = lut_size; ++ __drm_lut_to_dc_gamma(lut, gamma, false); ++ } + + if (func->tf == TRANSFER_FUNCTION_LINEAR) { + /* +@@ -285,27 +515,63 @@ static int __set_output_tf(struct dc_transfer_func *func, + * on top of a linear input. But degamma params can be used + * instead to simulate this. + */ +- gamma->type = GAMMA_CUSTOM; ++ if (gamma) ++ gamma->type = GAMMA_CUSTOM; + res = mod_color_calculate_degamma_params(NULL, func, +- gamma, true); ++ gamma, gamma != NULL); + } else { + /* + * Assume sRGB. The actual mapping will depend on whether the + * input was legacy or not. + */ +- gamma->type = GAMMA_CS_TFM_1D; +- res = mod_color_calculate_regamma_params(func, gamma, false, ++ if (gamma) ++ gamma->type = GAMMA_CS_TFM_1D; ++ res = mod_color_calculate_regamma_params(func, gamma, gamma != NULL, + has_rom, NULL, &cal_buffer); + } + +- dc_gamma_release(&gamma); ++ if (gamma) ++ dc_gamma_release(&gamma); + + return res ? 0 : -ENOMEM; + } + ++static int amdgpu_dm_set_atomic_regamma(struct dc_stream_state *stream, ++ const struct drm_color_lut *regamma_lut, ++ uint32_t regamma_size, bool has_rom, ++ enum dc_transfer_func_predefined tf) ++{ ++ struct dc_transfer_func *out_tf = stream->out_transfer_func; ++ int ret = 0; ++ ++ if (regamma_size || tf != TRANSFER_FUNCTION_LINEAR) { ++ /* CRTC RGM goes into RGM LUT. ++ * ++ * Note: there is no implicit sRGB regamma here. We are using ++ * degamma calculation from color module to calculate the curve ++ * from a linear base. ++ */ ++ out_tf->type = TF_TYPE_DISTRIBUTED_POINTS; ++ out_tf->tf = tf; ++ out_tf->sdr_ref_white_level = SDR_WHITE_LEVEL_INIT_VALUE; ++ ++ ret = __set_output_tf(out_tf, regamma_lut, regamma_size, has_rom); ++ } else { ++ /* ++ * No CRTC RGM means we can just put the block into bypass ++ * since we don't have any plane level adjustments using it. ++ */ ++ out_tf->type = TF_TYPE_BYPASS; ++ out_tf->tf = TRANSFER_FUNCTION_LINEAR; ++ } ++ ++ return ret; ++} ++ + /** + * __set_input_tf - calculates the input transfer function based on expected + * input space. ++ * @caps: dc color capabilities + * @func: transfer function + * @lut: lookup table that defines the color space + * @lut_size: size of respective lut. +@@ -313,27 +579,249 @@ static int __set_output_tf(struct dc_transfer_func *func, + * Returns: + * 0 in case of success. -ENOMEM if fails. 
+ */ +-static int __set_input_tf(struct dc_transfer_func *func, ++static int __set_input_tf(struct dc_color_caps *caps, struct dc_transfer_func *func, + const struct drm_color_lut *lut, uint32_t lut_size) + { + struct dc_gamma *gamma = NULL; + bool res; + +- gamma = dc_create_gamma(); +- if (!gamma) +- return -ENOMEM; ++ if (lut_size) { ++ gamma = dc_create_gamma(); ++ if (!gamma) ++ return -ENOMEM; + +- gamma->type = GAMMA_CUSTOM; +- gamma->num_entries = lut_size; ++ gamma->type = GAMMA_CUSTOM; ++ gamma->num_entries = lut_size; + +- __drm_lut_to_dc_gamma(lut, gamma, false); ++ __drm_lut_to_dc_gamma(lut, gamma, false); ++ } + +- res = mod_color_calculate_degamma_params(NULL, func, gamma, true); +- dc_gamma_release(&gamma); ++ res = mod_color_calculate_degamma_params(caps, func, gamma, gamma != NULL); ++ ++ if (gamma) ++ dc_gamma_release(&gamma); + + return res ? 0 : -ENOMEM; + } + ++static enum dc_transfer_func_predefined ++amdgpu_tf_to_dc_tf(enum amdgpu_transfer_function tf) ++{ ++ switch (tf) ++ { ++ default: ++ case AMDGPU_TRANSFER_FUNCTION_DEFAULT: ++ case AMDGPU_TRANSFER_FUNCTION_LINEAR: ++ return TRANSFER_FUNCTION_LINEAR; ++ case AMDGPU_TRANSFER_FUNCTION_SRGB_EOTF: ++ case AMDGPU_TRANSFER_FUNCTION_SRGB_INV_EOTF: ++ return TRANSFER_FUNCTION_SRGB; ++ case AMDGPU_TRANSFER_FUNCTION_BT709_EOTF: ++ case AMDGPU_TRANSFER_FUNCTION_BT709_INV_EOTF: ++ return TRANSFER_FUNCTION_BT709; ++ case AMDGPU_TRANSFER_FUNCTION_PQ_EOTF: ++ case AMDGPU_TRANSFER_FUNCTION_PQ_INV_EOTF: ++ return TRANSFER_FUNCTION_PQ; ++ case AMDGPU_TRANSFER_FUNCTION_UNITY: ++ return TRANSFER_FUNCTION_UNITY; ++ case AMDGPU_TRANSFER_FUNCTION_GAMMA22_EOTF: ++ case AMDGPU_TRANSFER_FUNCTION_GAMMA22_INV_EOTF: ++ return TRANSFER_FUNCTION_GAMMA22; ++ case AMDGPU_TRANSFER_FUNCTION_GAMMA24_EOTF: ++ case AMDGPU_TRANSFER_FUNCTION_GAMMA24_INV_EOTF: ++ return TRANSFER_FUNCTION_GAMMA24; ++ case AMDGPU_TRANSFER_FUNCTION_GAMMA26_EOTF: ++ case AMDGPU_TRANSFER_FUNCTION_GAMMA26_INV_EOTF: ++ return TRANSFER_FUNCTION_GAMMA26; ++ } ++} ++ ++static void __to_dc_lut3d_color(struct dc_rgb *rgb, ++ const struct drm_color_lut lut, ++ int bit_precision) ++{ ++ rgb->red = drm_color_lut_extract(lut.red, bit_precision); ++ rgb->green = drm_color_lut_extract(lut.green, bit_precision); ++ rgb->blue = drm_color_lut_extract(lut.blue, bit_precision); ++} ++ ++static void __drm_3dlut_to_dc_3dlut(const struct drm_color_lut *lut, ++ uint32_t lut3d_size, ++ struct tetrahedral_params *params, ++ bool use_tetrahedral_9, ++ int bit_depth) ++{ ++ struct dc_rgb *lut0; ++ struct dc_rgb *lut1; ++ struct dc_rgb *lut2; ++ struct dc_rgb *lut3; ++ int lut_i, i; ++ ++ ++ if (use_tetrahedral_9) { ++ lut0 = params->tetrahedral_9.lut0; ++ lut1 = params->tetrahedral_9.lut1; ++ lut2 = params->tetrahedral_9.lut2; ++ lut3 = params->tetrahedral_9.lut3; ++ } else { ++ lut0 = params->tetrahedral_17.lut0; ++ lut1 = params->tetrahedral_17.lut1; ++ lut2 = params->tetrahedral_17.lut2; ++ lut3 = params->tetrahedral_17.lut3; ++ } ++ ++ for (lut_i = 0, i = 0; i < lut3d_size - 4; lut_i++, i += 4) { ++ /* We should consider the 3dlut RGB values are distributed ++ * along four arrays lut0-3 where the first sizes 1229 and the ++ * other 1228. The bit depth supported for 3dlut channel is ++ * 12-bit, but DC also supports 10-bit. ++ * ++ * TODO: improve color pipeline API to enable the userspace set ++ * bit depth and 3D LUT size/stride, as specified by VA-API. 
++ */ ++ __to_dc_lut3d_color(&lut0[lut_i], lut[i], bit_depth); ++ __to_dc_lut3d_color(&lut1[lut_i], lut[i + 1], bit_depth); ++ __to_dc_lut3d_color(&lut2[lut_i], lut[i + 2], bit_depth); ++ __to_dc_lut3d_color(&lut3[lut_i], lut[i + 3], bit_depth); ++ } ++ /* lut0 has 1229 points (lut_size/4 + 1) */ ++ __to_dc_lut3d_color(&lut0[lut_i], lut[i], bit_depth); ++} ++ ++/* amdgpu_dm_atomic_lut3d - set DRM 3D LUT to DC stream ++ * @drm_lut3d: DRM CRTC (user) 3D LUT ++ * @drm_lut3d_size: size of 3D LUT ++ * @lut3d: DC 3D LUT ++ * ++ * Map DRM CRTC 3D LUT to DC 3D LUT and all necessary bits to program it ++ * on DCN MPC accordingly. ++ */ ++static void amdgpu_dm_atomic_lut3d(const struct drm_color_lut *drm_lut, ++ uint32_t drm_lut3d_size, ++ struct dc_3dlut *lut) ++{ ++ if (!drm_lut3d_size) { ++ lut->state.bits.initialized = 0; ++ } else { ++ /* Stride and bit depth are not programmable by API yet. ++ * Therefore, only supports 17x17x17 3D LUT (12-bit). ++ */ ++ lut->lut_3d.use_tetrahedral_9 = false; ++ lut->lut_3d.use_12bits = true; ++ lut->state.bits.initialized = 1; ++ __drm_3dlut_to_dc_3dlut(drm_lut, drm_lut3d_size, &lut->lut_3d, ++ lut->lut_3d.use_tetrahedral_9, ++ MAX_COLOR_3DLUT_BITDEPTH); ++ } ++} ++ ++static int amdgpu_dm_atomic_shaper_lut(const struct drm_color_lut *shaper_lut, ++ bool has_rom, ++ enum dc_transfer_func_predefined tf, ++ uint32_t shaper_size, ++ struct dc_transfer_func *func_shaper) ++{ ++ int ret = 0; ++ ++ if (shaper_size || tf != TRANSFER_FUNCTION_LINEAR) { ++ /* If DRM shaper LUT is set, we assume a linear color space ++ * (linearized by DRM degamma 1D LUT or not) ++ */ ++ func_shaper->type = TF_TYPE_DISTRIBUTED_POINTS; ++ func_shaper->tf = tf; ++ func_shaper->sdr_ref_white_level = SDR_WHITE_LEVEL_INIT_VALUE; ++ ++ ret = __set_output_tf(func_shaper, shaper_lut, shaper_size, has_rom); ++ } else { ++ func_shaper->type = TF_TYPE_BYPASS; ++ func_shaper->tf = TRANSFER_FUNCTION_LINEAR; ++ } ++ ++ return ret; ++} ++ ++static int amdgpu_dm_atomic_blend_lut(const struct drm_color_lut *blend_lut, ++ bool has_rom, ++ enum dc_transfer_func_predefined tf, ++ uint32_t blend_size, ++ struct dc_transfer_func *func_blend) ++{ ++ int ret = 0; ++ ++ if (blend_size || tf != TRANSFER_FUNCTION_LINEAR) { ++ /* DRM plane gamma LUT or TF means we are linearizing color ++ * space before blending (similar to degamma programming). As ++ * we don't have hardcoded curve support, or we use AMD color ++ * module to fill the parameters that will be translated to HW ++ * points. ++ */ ++ func_blend->type = TF_TYPE_DISTRIBUTED_POINTS; ++ func_blend->tf = tf; ++ func_blend->sdr_ref_white_level = SDR_WHITE_LEVEL_INIT_VALUE; ++ ++ ret = __set_input_tf(NULL, func_blend, blend_lut, blend_size); ++ } else { ++ func_blend->type = TF_TYPE_BYPASS; ++ func_blend->tf = TRANSFER_FUNCTION_LINEAR; ++ } ++ ++ return ret; ++} ++ ++/* amdgpu_dm_lut3d_size - get expected size according to hw color caps ++ * @adev: amdgpu device ++ * @lut_size: default size ++ * ++ * Return: ++ * lut_size if DC 3D LUT is supported, zero otherwise. ++ */ ++static uint32_t amdgpu_dm_get_lut3d_size(struct amdgpu_device *adev, ++ uint32_t lut_size) ++{ ++ return adev->dm.dc->caps.color.dpp.hw_3d_lut ? 
lut_size : 0; ++} ++ ++/** ++ * amdgpu_dm_verify_lut3d_size - verifies if 3D LUT is supported and if DRM 3D ++ * LUT matches the hw supported size ++ * @adev: amdgpu device ++ * @crtc_state: the DRM CRTC state ++ * ++ * Verifies if post-blending (MPC) 3D LUT is supported by the HW (DCN 3.0 or ++ * newer) and if the DRM 3D LUT matches the supported size. ++ * ++ * Returns: ++ * 0 on success. -EINVAL if lut size are invalid. ++ */ ++int amdgpu_dm_verify_lut3d_size(struct amdgpu_device *adev, ++ struct drm_plane_state *plane_state) ++{ ++ struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state); ++ const struct drm_color_lut *shaper = NULL, *lut3d = NULL; ++ uint32_t exp_size, size; ++ ++ /* shaper LUT is only available if 3D LUT color caps*/ ++ exp_size = amdgpu_dm_get_lut3d_size(adev, MAX_COLOR_LUT_ENTRIES); ++ shaper = __extract_blob_lut(dm_plane_state->shaper_lut, &size); ++ ++ if (shaper && size != exp_size) { ++ drm_dbg(&adev->ddev, ++ "Invalid Shaper LUT size. Should be %u but got %u.\n", ++ exp_size, size); ++ } ++ ++ exp_size = amdgpu_dm_get_lut3d_size(adev, MAX_COLOR_3DLUT_ENTRIES); ++ lut3d = __extract_blob_lut(dm_plane_state->lut3d, &size); ++ ++ if (lut3d && size != exp_size) { ++ drm_dbg(&adev->ddev, "Invalid 3D LUT size. Should be %u but got %u.\n", ++ exp_size, size); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ + /** + * amdgpu_dm_verify_lut_sizes - verifies if DRM luts match the hw supported sizes + * @crtc_state: the DRM CRTC state +@@ -401,9 +889,12 @@ int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc) + const struct drm_color_lut *degamma_lut, *regamma_lut; + uint32_t degamma_size, regamma_size; + bool has_regamma, has_degamma; ++ enum dc_transfer_func_predefined tf = TRANSFER_FUNCTION_LINEAR; + bool is_legacy; + int r; + ++ tf = amdgpu_tf_to_dc_tf(crtc->regamma_tf); ++ + r = amdgpu_dm_verify_lut_sizes(&crtc->base); + if (r) + return r; +@@ -440,26 +931,22 @@ int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc) + stream->out_transfer_func->type = TF_TYPE_DISTRIBUTED_POINTS; + stream->out_transfer_func->tf = TRANSFER_FUNCTION_SRGB; + ++ /* Note: although we pass has_rom as parameter here, we never ++ * actually use ROM because the color module only takes the ROM ++ * path if transfer_func->type == PREDEFINED. ++ * ++ * See more in mod_color_calculate_regamma_params() ++ */ + r = __set_legacy_tf(stream->out_transfer_func, regamma_lut, + regamma_size, has_rom); + if (r) + return r; +- } else if (has_regamma) { +- /* If atomic regamma, CRTC RGM goes into RGM LUT. */ +- stream->out_transfer_func->type = TF_TYPE_DISTRIBUTED_POINTS; +- stream->out_transfer_func->tf = TRANSFER_FUNCTION_LINEAR; +- +- r = __set_output_tf(stream->out_transfer_func, regamma_lut, +- regamma_size, has_rom); ++ } else { ++ regamma_size = has_regamma ? regamma_size : 0; ++ r = amdgpu_dm_set_atomic_regamma(stream, regamma_lut, ++ regamma_size, has_rom, tf); + if (r) + return r; +- } else { +- /* +- * No CRTC RGM means we can just put the block into bypass +- * since we don't have any plane level adjustments using it. +- */ +- stream->out_transfer_func->type = TF_TYPE_BYPASS; +- stream->out_transfer_func->tf = TRANSFER_FUNCTION_LINEAR; + } + + /* +@@ -495,20 +982,10 @@ int amdgpu_dm_update_crtc_color_mgmt(struct dm_crtc_state *crtc) + return 0; + } + +-/** +- * amdgpu_dm_update_plane_color_mgmt: Maps DRM color management to DC plane. 
+- * @crtc: amdgpu_dm crtc state +- * @dc_plane_state: target DC surface +- * +- * Update the underlying dc_stream_state's input transfer function (ITF) in +- * preparation for hardware commit. The transfer function used depends on +- * the preparation done on the stream for color management. +- * +- * Returns: +- * 0 on success. -ENOMEM if mem allocation fails. +- */ +-int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc, +- struct dc_plane_state *dc_plane_state) ++static int ++map_crtc_degamma_to_dc_plane(struct dm_crtc_state *crtc, ++ struct dc_plane_state *dc_plane_state, ++ struct dc_color_caps *caps) + { + const struct drm_color_lut *degamma_lut; + enum dc_transfer_func_predefined tf = TRANSFER_FUNCTION_SRGB; +@@ -531,8 +1008,7 @@ int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc, + °amma_size); + ASSERT(degamma_size == MAX_COLOR_LUT_ENTRIES); + +- dc_plane_state->in_transfer_func->type = +- TF_TYPE_DISTRIBUTED_POINTS; ++ dc_plane_state->in_transfer_func->type = TF_TYPE_DISTRIBUTED_POINTS; + + /* + * This case isn't fully correct, but also fairly +@@ -564,11 +1040,11 @@ int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc, + dc_plane_state->in_transfer_func->tf = + TRANSFER_FUNCTION_LINEAR; + +- r = __set_input_tf(dc_plane_state->in_transfer_func, ++ r = __set_input_tf(caps, dc_plane_state->in_transfer_func, + degamma_lut, degamma_size); + if (r) + return r; +- } else if (crtc->cm_is_degamma_srgb) { ++ } else { + /* + * For legacy gamma support we need the regamma input + * in linear space. Assume that the input is sRGB. +@@ -577,14 +1053,213 @@ int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc, + dc_plane_state->in_transfer_func->tf = tf; + + if (tf != TRANSFER_FUNCTION_SRGB && +- !mod_color_calculate_degamma_params(NULL, +- dc_plane_state->in_transfer_func, NULL, false)) ++ !mod_color_calculate_degamma_params(caps, ++ dc_plane_state->in_transfer_func, ++ NULL, false)) ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static int ++__set_dm_plane_degamma(struct drm_plane_state *plane_state, ++ struct dc_plane_state *dc_plane_state, ++ struct dc_color_caps *color_caps) ++{ ++ struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state); ++ const struct drm_color_lut *degamma_lut; ++ enum amdgpu_transfer_function tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT; ++ uint32_t degamma_size; ++ bool has_degamma_lut; ++ int ret; ++ ++ degamma_lut = __extract_blob_lut(dm_plane_state->degamma_lut, ++ °amma_size); ++ ++ has_degamma_lut = degamma_lut && ++ !__is_lut_linear(degamma_lut, degamma_size); ++ ++ tf = dm_plane_state->degamma_tf; ++ ++ /* If we don't have plane degamma LUT nor TF to set on DC, we have ++ * nothing to do here, return. ++ */ ++ if (!has_degamma_lut && tf == AMDGPU_TRANSFER_FUNCTION_DEFAULT) ++ return -EINVAL; ++ ++ dc_plane_state->in_transfer_func->tf = amdgpu_tf_to_dc_tf(tf); ++ ++ if (has_degamma_lut) { ++ ASSERT(degamma_size == MAX_COLOR_LUT_ENTRIES); ++ ++ dc_plane_state->in_transfer_func->type = ++ TF_TYPE_DISTRIBUTED_POINTS; ++ ++ ret = __set_input_tf(color_caps, dc_plane_state->in_transfer_func, ++ degamma_lut, degamma_size); ++ if (ret) ++ return ret; ++ } else { ++ dc_plane_state->in_transfer_func->type = ++ TF_TYPE_PREDEFINED; ++ ++ if (!mod_color_calculate_degamma_params(color_caps, ++ dc_plane_state->in_transfer_func, NULL, false)) + return -ENOMEM; +- } else { +- /* ...Otherwise we can just bypass the DGM block. 
*/ +- dc_plane_state->in_transfer_func->type = TF_TYPE_BYPASS; +- dc_plane_state->in_transfer_func->tf = TRANSFER_FUNCTION_LINEAR; ++ } ++ return 0; ++} ++ ++static int ++amdgpu_dm_plane_set_color_properties(struct drm_plane_state *plane_state, ++ struct dc_plane_state *dc_plane_state, ++ struct dc_color_caps *color_caps) ++{ ++ struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state); ++ enum amdgpu_transfer_function shaper_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT; ++ enum amdgpu_transfer_function blend_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT; ++ const struct drm_color_lut *shaper_lut, *lut3d, *blend_lut; ++ uint32_t shaper_size, lut3d_size, blend_size; ++ int ret; ++ ++ /* We have nothing to do here, return */ ++ if (!plane_state->color_mgmt_changed) ++ return 0; ++ ++ dc_plane_state->hdr_mult = dc_fixpt_from_s3132(dm_plane_state->hdr_mult); ++ ++ shaper_lut = __extract_blob_lut(dm_plane_state->shaper_lut, &shaper_size); ++ shaper_size = shaper_lut != NULL ? shaper_size : 0; ++ shaper_tf = dm_plane_state->shaper_tf; ++ lut3d = __extract_blob_lut(dm_plane_state->lut3d, &lut3d_size); ++ lut3d_size = lut3d != NULL ? lut3d_size : 0; ++ ++ amdgpu_dm_atomic_lut3d(lut3d, lut3d_size, dc_plane_state->lut3d_func); ++ ret = amdgpu_dm_atomic_shaper_lut(shaper_lut, false, ++ amdgpu_tf_to_dc_tf(shaper_tf), ++ shaper_size, ++ dc_plane_state->in_shaper_func); ++ if (ret) { ++ drm_dbg_kms(plane_state->plane->dev, ++ "setting plane %d shaper LUT failed.\n", ++ plane_state->plane->index); ++ ++ return ret; ++ } ++ ++ blend_tf = dm_plane_state->blend_tf; ++ blend_lut = __extract_blob_lut(dm_plane_state->blend_lut, &blend_size); ++ blend_size = blend_lut != NULL ? blend_size : 0; ++ ++ ret = amdgpu_dm_atomic_blend_lut(blend_lut, false, ++ amdgpu_tf_to_dc_tf(blend_tf), ++ blend_size, dc_plane_state->blend_tf); ++ if (ret) { ++ drm_dbg_kms(plane_state->plane->dev, ++ "setting plane %d gamma lut failed.\n", ++ plane_state->plane->index); ++ ++ return ret; + } + + return 0; + } ++ ++/** ++ * amdgpu_dm_update_plane_color_mgmt: Maps DRM color management to DC plane. ++ * @crtc: amdgpu_dm crtc state ++ * @plane_state: DRM plane state ++ * @dc_plane_state: target DC surface ++ * ++ * Update the underlying dc_stream_state's input transfer function (ITF) in ++ * preparation for hardware commit. The transfer function used depends on ++ * the preparation done on the stream for color management. ++ * ++ * Returns: ++ * 0 on success. -ENOMEM if mem allocation fails. ++ */ ++int amdgpu_dm_update_plane_color_mgmt(struct dm_crtc_state *crtc, ++ struct drm_plane_state *plane_state, ++ struct dc_plane_state *dc_plane_state) ++{ ++ struct amdgpu_device *adev = drm_to_adev(crtc->base.state->dev); ++ struct dm_plane_state *dm_plane_state = to_dm_plane_state(plane_state); ++ struct drm_color_ctm2 *ctm = NULL; ++ struct dc_color_caps *color_caps = NULL; ++ bool has_crtc_cm_degamma; ++ int ret; ++ ++ ret = amdgpu_dm_verify_lut3d_size(adev, plane_state); ++ if (ret) { ++ drm_dbg_driver(&adev->ddev, "amdgpu_dm_verify_lut3d_size() failed\n"); ++ return ret; ++ } ++ ++ if (dc_plane_state->ctx && dc_plane_state->ctx->dc) ++ color_caps = &dc_plane_state->ctx->dc->caps.color; ++ ++ /* Initially, we can just bypass the DGM block. 
*/ ++ dc_plane_state->in_transfer_func->type = TF_TYPE_BYPASS; ++ dc_plane_state->in_transfer_func->tf = TRANSFER_FUNCTION_LINEAR; ++ ++ /* After, we start to update values according to color props */ ++ has_crtc_cm_degamma = (crtc->cm_has_degamma || crtc->cm_is_degamma_srgb); ++ ++ ret = __set_dm_plane_degamma(plane_state, dc_plane_state, color_caps); ++ if (ret == -ENOMEM) ++ return ret; ++ ++ /* We only have one degamma block available (pre-blending) for the ++ * whole color correction pipeline, so that we can't actually perform ++ * plane and CRTC degamma at the same time. Explicitly reject atomic ++ * updates when userspace sets both plane and CRTC degamma properties. ++ */ ++ if (has_crtc_cm_degamma && ret != -EINVAL){ ++ drm_dbg_kms(crtc->base.crtc->dev, ++ "doesn't support plane and CRTC degamma at the same time\n"); ++ return -EINVAL; ++ } ++ ++ /* If we are here, it means we don't have plane degamma settings, check ++ * if we have CRTC degamma waiting for mapping to pre-blending degamma ++ * block ++ */ ++ if (has_crtc_cm_degamma) { ++ /* AMD HW doesn't have post-blending degamma caps. When DRM ++ * CRTC atomic degamma is set, we maps it to DPP degamma block ++ * (pre-blending) or, on legacy gamma, we use DPP degamma to ++ * linearize (implicit degamma) from sRGB/BT709 according to ++ * the input space. ++ */ ++ ret = map_crtc_degamma_to_dc_plane(crtc, dc_plane_state, color_caps); ++ if (ret) ++ return ret; ++ } ++ ++ /* Setup CRTC CTM. */ ++ if (dm_plane_state->ctm) { ++ ctm = (struct drm_color_ctm2 *)dm_plane_state->ctm->data; ++ ++ /* ++ * So far, if we have both plane and CRTC CTM, plane CTM takes ++ * the priority and we discard data for CRTC CTM, as ++ * implemented in dcn10_program_gamut_remap(). However, we ++ * have MPC gamut_remap_matrix from DCN3 family, therefore we ++ * can remap MPC programing of the matrix to MPC block and ++ * provide support for both DPP and MPC matrix at the same ++ * time. ++ */ ++ __drm_ctm2_to_dc_matrix(ctm, dc_plane_state->gamut_remap_matrix.matrix); ++ ++ dc_plane_state->gamut_remap_matrix.enable_remap = true; ++ dc_plane_state->input_csc_color_matrix.enable_adjustment = false; ++ } else { ++ /* Bypass CTM. */ ++ dc_plane_state->gamut_remap_matrix.enable_remap = false; ++ dc_plane_state->input_csc_color_matrix.enable_adjustment = false; ++ } ++ ++ return amdgpu_dm_plane_set_color_properties(plane_state, ++ dc_plane_state, color_caps); ++} +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +index 30d4c6fd95f5..e7b38cce010c 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +@@ -253,6 +253,7 @@ static struct drm_crtc_state *dm_crtc_duplicate_state(struct drm_crtc *crtc) + state->freesync_config = cur->freesync_config; + state->cm_has_degamma = cur->cm_has_degamma; + state->cm_is_degamma_srgb = cur->cm_is_degamma_srgb; ++ state->regamma_tf = cur->regamma_tf; + state->crc_skip_count = cur->crc_skip_count; + state->mpo_requested = cur->mpo_requested; + /* TODO Duplicate dc_stream after objects are stream object is flattened */ +@@ -289,6 +290,70 @@ static int amdgpu_dm_crtc_late_register(struct drm_crtc *crtc) + } + #endif + ++#ifdef AMD_PRIVATE_COLOR ++/** ++ * drm_crtc_additional_color_mgmt - enable additional color properties ++ * @crtc: DRM CRTC ++ * ++ * This function lets the driver enable post-blending CRTC regamma transfer ++ * function property in addition to DRM CRTC gamma LUT. 
Default value means ++ * linear transfer function, which is the default CRTC gamma LUT behaviour ++ * without this property. ++ */ ++static void ++dm_crtc_additional_color_mgmt(struct drm_crtc *crtc) ++{ ++ struct amdgpu_device *adev = drm_to_adev(crtc->dev); ++ ++ if(adev->dm.dc->caps.color.mpc.ogam_ram) ++ drm_object_attach_property(&crtc->base, ++ adev->mode_info.regamma_tf_property, ++ AMDGPU_TRANSFER_FUNCTION_DEFAULT); ++} ++ ++static int ++amdgpu_dm_atomic_crtc_set_property(struct drm_crtc *crtc, ++ struct drm_crtc_state *state, ++ struct drm_property *property, ++ uint64_t val) ++{ ++ struct amdgpu_device *adev = drm_to_adev(crtc->dev); ++ struct dm_crtc_state *acrtc_state = to_dm_crtc_state(state); ++ ++ if (property == adev->mode_info.regamma_tf_property) { ++ if (acrtc_state->regamma_tf != val) { ++ acrtc_state->regamma_tf = val; ++ acrtc_state->base.color_mgmt_changed |= 1; ++ } ++ } else { ++ drm_dbg_atomic(crtc->dev, ++ "[CRTC:%d:%s] unknown property [PROP:%d:%s]]\n", ++ crtc->base.id, crtc->name, ++ property->base.id, property->name); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ++amdgpu_dm_atomic_crtc_get_property(struct drm_crtc *crtc, ++ const struct drm_crtc_state *state, ++ struct drm_property *property, ++ uint64_t *val) ++{ ++ struct amdgpu_device *adev = drm_to_adev(crtc->dev); ++ struct dm_crtc_state *acrtc_state = to_dm_crtc_state(state); ++ ++ if (property == adev->mode_info.regamma_tf_property) ++ *val = acrtc_state->regamma_tf; ++ else ++ return -EINVAL; ++ ++ return 0; ++} ++#endif ++ + /* Implemented only the options currently available for the driver */ + static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = { + .reset = dm_crtc_reset_state, +@@ -307,6 +372,10 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = { + #if defined(CONFIG_DEBUG_FS) + .late_register = amdgpu_dm_crtc_late_register, + #endif ++#ifdef AMD_PRIVATE_COLOR ++ .atomic_set_property = amdgpu_dm_atomic_crtc_set_property, ++ .atomic_get_property = amdgpu_dm_atomic_crtc_get_property, ++#endif + }; + + static void dm_crtc_helper_disable(struct drm_crtc *crtc) +@@ -482,6 +551,9 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm, + + drm_mode_crtc_set_gamma_size(&acrtc->base, MAX_COLOR_LEGACY_LUT_ENTRIES); + ++#ifdef AMD_PRIVATE_COLOR ++ dm_crtc_additional_color_mgmt(&acrtc->base); ++#endif + return 0; + + fail: +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +index 322668973747..60e5ffb1863d 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +@@ -1317,8 +1317,14 @@ static void dm_drm_plane_reset(struct drm_plane *plane) + amdgpu_state = kzalloc(sizeof(*amdgpu_state), GFP_KERNEL); + WARN_ON(amdgpu_state == NULL); + +- if (amdgpu_state) +- __drm_atomic_helper_plane_reset(plane, &amdgpu_state->base); ++ if (!amdgpu_state) ++ return; ++ ++ __drm_atomic_helper_plane_reset(plane, &amdgpu_state->base); ++ amdgpu_state->degamma_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT; ++ amdgpu_state->hdr_mult = AMDGPU_HDR_MULT_DEFAULT; ++ amdgpu_state->shaper_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT; ++ amdgpu_state->blend_tf = AMDGPU_TRANSFER_FUNCTION_DEFAULT; + } + + static struct drm_plane_state * +@@ -1338,6 +1344,22 @@ dm_drm_plane_duplicate_state(struct drm_plane *plane) + dc_plane_state_retain(dm_plane_state->dc_state); + } + ++ if (dm_plane_state->degamma_lut) ++ drm_property_blob_get(dm_plane_state->degamma_lut); ++ if 
(dm_plane_state->ctm) ++ drm_property_blob_get(dm_plane_state->ctm); ++ if (dm_plane_state->shaper_lut) ++ drm_property_blob_get(dm_plane_state->shaper_lut); ++ if (dm_plane_state->lut3d) ++ drm_property_blob_get(dm_plane_state->lut3d); ++ if (dm_plane_state->blend_lut) ++ drm_property_blob_get(dm_plane_state->blend_lut); ++ ++ dm_plane_state->degamma_tf = old_dm_plane_state->degamma_tf; ++ dm_plane_state->hdr_mult = old_dm_plane_state->hdr_mult; ++ dm_plane_state->shaper_tf = old_dm_plane_state->shaper_tf; ++ dm_plane_state->blend_tf = old_dm_plane_state->blend_tf; ++ + return &dm_plane_state->base; + } + +@@ -1405,12 +1427,203 @@ static void dm_drm_plane_destroy_state(struct drm_plane *plane, + { + struct dm_plane_state *dm_plane_state = to_dm_plane_state(state); + ++ if (dm_plane_state->degamma_lut) ++ drm_property_blob_put(dm_plane_state->degamma_lut); ++ if (dm_plane_state->ctm) ++ drm_property_blob_put(dm_plane_state->ctm); ++ if (dm_plane_state->lut3d) ++ drm_property_blob_put(dm_plane_state->lut3d); ++ if (dm_plane_state->shaper_lut) ++ drm_property_blob_put(dm_plane_state->shaper_lut); ++ if (dm_plane_state->blend_lut) ++ drm_property_blob_put(dm_plane_state->blend_lut); ++ + if (dm_plane_state->dc_state) + dc_plane_state_release(dm_plane_state->dc_state); + + drm_atomic_helper_plane_destroy_state(plane, state); + } + ++#ifdef AMD_PRIVATE_COLOR ++static void ++dm_atomic_plane_attach_color_mgmt_properties(struct amdgpu_display_manager *dm, ++ struct drm_plane *plane) ++{ ++ struct amdgpu_mode_info mode_info = dm->adev->mode_info; ++ struct dpp_color_caps dpp_color_caps = dm->dc->caps.color.dpp; ++ ++ /* Check HW color pipeline capabilities for DPP (pre-blending) before expose*/ ++ if (dpp_color_caps.dgam_ram || dpp_color_caps.gamma_corr) { ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_degamma_lut_property, 0); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_degamma_lut_size_property, ++ MAX_COLOR_LUT_ENTRIES); ++ drm_object_attach_property(&plane->base, ++ dm->adev->mode_info.plane_degamma_tf_property, ++ AMDGPU_TRANSFER_FUNCTION_DEFAULT); ++ } ++ /* HDR MULT is always available */ ++ drm_object_attach_property(&plane->base, ++ dm->adev->mode_info.plane_hdr_mult_property, ++ AMDGPU_HDR_MULT_DEFAULT); ++ ++ /* Only enable plane CTM if both DPP and MPC gamut remap is available. 
*/ ++ if (dm->dc->caps.color.mpc.gamut_remap) ++ drm_object_attach_property(&plane->base, ++ dm->adev->mode_info.plane_ctm_property, 0); ++ ++ if (dpp_color_caps.hw_3d_lut) { ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_shaper_lut_property, 0); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_shaper_lut_size_property, ++ MAX_COLOR_LUT_ENTRIES); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_shaper_tf_property, ++ AMDGPU_TRANSFER_FUNCTION_DEFAULT); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_lut3d_property, 0); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_lut3d_size_property, ++ MAX_COLOR_3DLUT_ENTRIES); ++ } ++ ++ if (dpp_color_caps.ogam_ram) { ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_blend_lut_property, 0); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_blend_lut_size_property, ++ MAX_COLOR_LUT_ENTRIES); ++ drm_object_attach_property(&plane->base, ++ mode_info.plane_blend_tf_property, ++ AMDGPU_TRANSFER_FUNCTION_DEFAULT); ++ } ++} ++ ++static int ++dm_atomic_plane_set_property(struct drm_plane *plane, ++ struct drm_plane_state *state, ++ struct drm_property *property, ++ uint64_t val) ++{ ++ struct dm_plane_state *dm_plane_state = to_dm_plane_state(state); ++ struct amdgpu_device *adev = drm_to_adev(plane->dev); ++ bool replaced = false; ++ int ret; ++ ++ if (property == adev->mode_info.plane_degamma_lut_property) { ++ ret = drm_property_replace_blob_from_id(plane->dev, ++ &dm_plane_state->degamma_lut, ++ val, ++ -1, sizeof(struct drm_color_lut), ++ &replaced); ++ dm_plane_state->base.color_mgmt_changed |= replaced; ++ return ret; ++ } else if (property == adev->mode_info.plane_degamma_tf_property) { ++ if (dm_plane_state->degamma_tf != val) { ++ dm_plane_state->degamma_tf = val; ++ dm_plane_state->base.color_mgmt_changed = 1; ++ } ++ } else if (property == adev->mode_info.plane_hdr_mult_property) { ++ if (dm_plane_state->hdr_mult != val) { ++ dm_plane_state->hdr_mult = val; ++ dm_plane_state->base.color_mgmt_changed = 1; ++ } ++ } else if (property == adev->mode_info.plane_ctm_property) { ++ ret = drm_property_replace_blob_from_id(plane->dev, ++ &dm_plane_state->ctm, ++ val, ++ sizeof(struct drm_color_ctm2), -1, ++ &replaced); ++ dm_plane_state->base.color_mgmt_changed |= replaced; ++ return ret; ++ } else if (property == adev->mode_info.plane_shaper_lut_property) { ++ ret = drm_property_replace_blob_from_id(plane->dev, ++ &dm_plane_state->shaper_lut, ++ val, -1, ++ sizeof(struct drm_color_lut), ++ &replaced); ++ dm_plane_state->base.color_mgmt_changed |= replaced; ++ return ret; ++ } else if (property == adev->mode_info.plane_shaper_tf_property) { ++ if (dm_plane_state->shaper_tf != val) { ++ dm_plane_state->shaper_tf = val; ++ dm_plane_state->base.color_mgmt_changed = 1; ++ } ++ } else if (property == adev->mode_info.plane_lut3d_property) { ++ ret = drm_property_replace_blob_from_id(plane->dev, ++ &dm_plane_state->lut3d, ++ val, -1, ++ sizeof(struct drm_color_lut), ++ &replaced); ++ dm_plane_state->base.color_mgmt_changed |= replaced; ++ return ret; ++ } else if (property == adev->mode_info.plane_blend_lut_property) { ++ ret = drm_property_replace_blob_from_id(plane->dev, ++ &dm_plane_state->blend_lut, ++ val, -1, ++ sizeof(struct drm_color_lut), ++ &replaced); ++ dm_plane_state->base.color_mgmt_changed |= replaced; ++ return ret; ++ } else if (property == adev->mode_info.plane_blend_tf_property) { ++ if (dm_plane_state->blend_tf != val) { ++ 
dm_plane_state->blend_tf = val; ++ dm_plane_state->base.color_mgmt_changed = 1; ++ } ++ } else { ++ drm_dbg_atomic(plane->dev, ++ "[PLANE:%d:%s] unknown property [PROP:%d:%s]]\n", ++ plane->base.id, plane->name, ++ property->base.id, property->name); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int ++dm_atomic_plane_get_property(struct drm_plane *plane, ++ const struct drm_plane_state *state, ++ struct drm_property *property, ++ uint64_t *val) ++{ ++ struct dm_plane_state *dm_plane_state = to_dm_plane_state(state); ++ struct amdgpu_device *adev = drm_to_adev(plane->dev); ++ ++ if (property == adev->mode_info.plane_degamma_lut_property) { ++ *val = (dm_plane_state->degamma_lut) ? ++ dm_plane_state->degamma_lut->base.id : 0; ++ } else if (property == adev->mode_info.plane_degamma_tf_property) { ++ *val = dm_plane_state->degamma_tf; ++ } else if (property == adev->mode_info.plane_hdr_mult_property) { ++ *val = dm_plane_state->hdr_mult; ++ } else if (property == adev->mode_info.plane_ctm_property) { ++ *val = (dm_plane_state->ctm) ? ++ dm_plane_state->ctm->base.id : 0; ++ } else if (property == adev->mode_info.plane_shaper_lut_property) { ++ *val = (dm_plane_state->shaper_lut) ? ++ dm_plane_state->shaper_lut->base.id : 0; ++ } else if (property == adev->mode_info.plane_shaper_tf_property) { ++ *val = dm_plane_state->shaper_tf; ++ } else if (property == adev->mode_info.plane_lut3d_property) { ++ *val = (dm_plane_state->lut3d) ? ++ dm_plane_state->lut3d->base.id : 0; ++ } else if (property == adev->mode_info.plane_blend_lut_property) { ++ *val = (dm_plane_state->blend_lut) ? ++ dm_plane_state->blend_lut->base.id : 0; ++ } else if (property == adev->mode_info.plane_blend_tf_property) { ++ *val = dm_plane_state->blend_tf; ++ ++ } else { ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++#endif ++ + static const struct drm_plane_funcs dm_plane_funcs = { + .update_plane = drm_atomic_helper_update_plane, + .disable_plane = drm_atomic_helper_disable_plane, +@@ -1419,6 +1632,10 @@ static const struct drm_plane_funcs dm_plane_funcs = { + .atomic_duplicate_state = dm_drm_plane_duplicate_state, + .atomic_destroy_state = dm_drm_plane_destroy_state, + .format_mod_supported = dm_plane_format_mod_supported, ++#ifdef AMD_PRIVATE_COLOR ++ .atomic_set_property = dm_atomic_plane_set_property, ++ .atomic_get_property = dm_atomic_plane_get_property, ++#endif + }; + + int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm, +@@ -1489,6 +1706,9 @@ int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm, + + drm_plane_helper_add(plane, &dm_plane_helper_funcs); + ++#ifdef AMD_PRIVATE_COLOR ++ dm_atomic_plane_attach_color_mgmt_properties(dm, plane); ++#endif + /* Create (reset) the plane state */ + if (plane->funcs->reset) + plane->funcs->reset(plane); +diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c +index 3538973bd0c6..04b2e04b68f3 100644 +--- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c ++++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c +@@ -349,20 +349,37 @@ bool cm_helper_translate_curve_to_hw_format(struct dc_context *ctx, + * segment is from 2^-10 to 2^1 + * There are less than 256 points, for optimization + */ +- seg_distr[0] = 3; +- seg_distr[1] = 4; +- seg_distr[2] = 4; +- seg_distr[3] = 4; +- seg_distr[4] = 4; +- seg_distr[5] = 4; +- seg_distr[6] = 4; +- seg_distr[7] = 4; +- seg_distr[8] = 4; +- seg_distr[9] = 4; +- seg_distr[10] = 1; +- +- region_start = -10; +- region_end = 1; ++ if (output_tf->tf 
== TRANSFER_FUNCTION_LINEAR) { ++ seg_distr[0] = 0; /* 2 */ ++ seg_distr[1] = 1; /* 4 */ ++ seg_distr[2] = 2; /* 4 */ ++ seg_distr[3] = 3; /* 8 */ ++ seg_distr[4] = 4; /* 16 */ ++ seg_distr[5] = 5; /* 32 */ ++ seg_distr[6] = 6; /* 64 */ ++ seg_distr[7] = 7; /* 128 */ ++ ++ region_start = -8; ++ region_end = 1; ++ } else { ++ seg_distr[0] = 3; /* 8 */ ++ seg_distr[1] = 4; /* 16 */ ++ seg_distr[2] = 4; ++ seg_distr[3] = 4; ++ seg_distr[4] = 4; ++ seg_distr[5] = 4; ++ seg_distr[6] = 4; ++ seg_distr[7] = 4; ++ seg_distr[8] = 4; ++ seg_distr[9] = 4; ++ seg_distr[10] = 1; /* 2 */ ++ /* total = 8*16 + 8 + 64 + 2 = */ ++ ++ region_start = -10; ++ region_end = 1; ++ } ++ ++ + } + + for (i = region_end - region_start; i < MAX_REGIONS_NUMBER ; i++) +@@ -375,16 +392,56 @@ bool cm_helper_translate_curve_to_hw_format(struct dc_context *ctx, + + j = 0; + for (k = 0; k < (region_end - region_start); k++) { +- increment = NUMBER_SW_SEGMENTS / (1 << seg_distr[k]); ++ /* ++ * We're using an ugly-ish hack here. Our HW allows for ++ * 256 segments per region but SW_SEGMENTS is 16. ++ * SW_SEGMENTS has some undocumented relationship to ++ * the number of points in the tf_pts struct, which ++ * is 512, unlike what's suggested TRANSFER_FUNC_POINTS. ++ * ++ * In order to work past this dilemma we'll scale our ++ * increment by (1 << 4) and then do the inverse (1 >> 4) ++ * when accessing the elements in tf_pts. ++ * ++ * TODO: find a better way using SW_SEGMENTS and ++ * TRANSFER_FUNC_POINTS definitions ++ */ ++ increment = (NUMBER_SW_SEGMENTS << 4) / (1 << seg_distr[k]); + start_index = (region_start + k + MAX_LOW_POINT) * + NUMBER_SW_SEGMENTS; +- for (i = start_index; i < start_index + NUMBER_SW_SEGMENTS; ++ for (i = (start_index << 4); i < (start_index << 4) + (NUMBER_SW_SEGMENTS << 4); + i += increment) { ++ struct fixed31_32 in_plus_one, in; ++ struct fixed31_32 value, red_value, green_value, blue_value; ++ uint32_t t = i & 0xf; ++ + if (j == hw_points - 1) + break; +- rgb_resulted[j].red = output_tf->tf_pts.red[i]; +- rgb_resulted[j].green = output_tf->tf_pts.green[i]; +- rgb_resulted[j].blue = output_tf->tf_pts.blue[i]; ++ ++ in_plus_one = output_tf->tf_pts.red[(i >> 4) + 1]; ++ in = output_tf->tf_pts.red[i >> 4]; ++ value = dc_fixpt_sub(in_plus_one, in); ++ value = dc_fixpt_shr(dc_fixpt_mul_int(value, t), 4); ++ value = dc_fixpt_add(in, value); ++ red_value = value; ++ ++ in_plus_one = output_tf->tf_pts.green[(i >> 4) + 1]; ++ in = output_tf->tf_pts.green[i >> 4]; ++ value = dc_fixpt_sub(in_plus_one, in); ++ value = dc_fixpt_shr(dc_fixpt_mul_int(value, t), 4); ++ value = dc_fixpt_add(in, value); ++ green_value = value; ++ ++ in_plus_one = output_tf->tf_pts.blue[(i >> 4) + 1]; ++ in = output_tf->tf_pts.blue[i >> 4]; ++ value = dc_fixpt_sub(in_plus_one, in); ++ value = dc_fixpt_shr(dc_fixpt_mul_int(value, t), 4); ++ value = dc_fixpt_add(in, value); ++ blue_value = value; ++ ++ rgb_resulted[j].red = red_value; ++ rgb_resulted[j].green = green_value; ++ rgb_resulted[j].blue = blue_value; + j++; + } + } +diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c +index bf8864bc8a99..72558eb877dc 100644 +--- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c ++++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c +@@ -186,6 +186,43 @@ bool dcn30_set_input_transfer_func(struct dc *dc, + return result; + } + ++void dcn30_program_gamut_remap(struct pipe_ctx *pipe_ctx) ++{ ++ int i = 0; ++ struct dpp_grph_csc_adjustment dpp_adjust; ++ struct 
mpc_grph_gamut_adjustment mpc_adjust; ++ int mpcc_id = pipe_ctx->plane_res.hubp->inst; ++ struct mpc *mpc = pipe_ctx->stream_res.opp->ctx->dc->res_pool->mpc; ++ ++ memset(&dpp_adjust, 0, sizeof(dpp_adjust)); ++ dpp_adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_BYPASS; ++ ++ if (pipe_ctx->plane_state && ++ pipe_ctx->plane_state->gamut_remap_matrix.enable_remap == true) { ++ dpp_adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_SW; ++ for (i = 0; i < CSC_TEMPERATURE_MATRIX_SIZE; i++) ++ dpp_adjust.temperature_matrix[i] = ++ pipe_ctx->plane_state->gamut_remap_matrix.matrix[i]; ++ } ++ ++ pipe_ctx->plane_res.dpp->funcs->dpp_set_gamut_remap(pipe_ctx->plane_res.dpp, ++ &dpp_adjust); ++ ++ memset(&mpc_adjust, 0, sizeof(mpc_adjust)); ++ mpc_adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_BYPASS; ++ ++ if (pipe_ctx->top_pipe == NULL) { ++ if (pipe_ctx->stream->gamut_remap_matrix.enable_remap == true) { ++ mpc_adjust.gamut_adjust_type = GRAPHICS_GAMUT_ADJUST_TYPE_SW; ++ for (i = 0; i < CSC_TEMPERATURE_MATRIX_SIZE; i++) ++ mpc_adjust.temperature_matrix[i] = ++ pipe_ctx->stream->gamut_remap_matrix.matrix[i]; ++ } ++ } ++ ++ mpc->funcs->set_gamut_remap(mpc, mpcc_id, &mpc_adjust); ++} ++ + bool dcn30_set_output_transfer_func(struct dc *dc, + struct pipe_ctx *pipe_ctx, + const struct dc_stream_state *stream) +diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h +index a24a8e33a3d2..cb34ca932a5f 100644 +--- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h ++++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h +@@ -58,6 +58,9 @@ bool dcn30_set_blend_lut(struct pipe_ctx *pipe_ctx, + bool dcn30_set_input_transfer_func(struct dc *dc, + struct pipe_ctx *pipe_ctx, + const struct dc_plane_state *plane_state); ++ ++void dcn30_program_gamut_remap(struct pipe_ctx *pipe_ctx); ++ + bool dcn30_set_output_transfer_func(struct dc *dc, + struct pipe_ctx *pipe_ctx, + const struct dc_stream_state *stream); +diff --git a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c +index 257df8660b4c..81fd50ee97c3 100644 +--- a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c ++++ b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c +@@ -33,7 +33,7 @@ + #include "dcn301_init.h" + + static const struct hw_sequencer_funcs dcn301_funcs = { +- .program_gamut_remap = dcn10_program_gamut_remap, ++ .program_gamut_remap = dcn30_program_gamut_remap, + .init_hw = dcn10_init_hw, + .power_down_on_boot = dcn10_power_down_on_boot, + .apply_ctx_to_hw = dce110_apply_ctx_to_hw, +diff --git a/drivers/gpu/drm/amd/display/include/fixed31_32.h b/drivers/gpu/drm/amd/display/include/fixed31_32.h +index d4cf7ead1d87..84da1dd34efd 100644 +--- a/drivers/gpu/drm/amd/display/include/fixed31_32.h ++++ b/drivers/gpu/drm/amd/display/include/fixed31_32.h +@@ -69,6 +69,18 @@ static const struct fixed31_32 dc_fixpt_epsilon = { 1LL }; + static const struct fixed31_32 dc_fixpt_half = { 0x80000000LL }; + static const struct fixed31_32 dc_fixpt_one = { 0x100000000LL }; + ++static inline struct fixed31_32 dc_fixpt_from_s3132(__u64 x) ++{ ++ struct fixed31_32 val; ++ ++ /* If negative, convert to 2's complement. 
*/ ++ if (x & (1ULL << 63)) ++ x = -(x & ~(1ULL << 63)); ++ ++ val.value = x; ++ return val; ++} ++ + /* + * @brief + * Initialization routines +diff --git a/drivers/gpu/drm/arm/malidp_crtc.c b/drivers/gpu/drm/arm/malidp_crtc.c +index dc01c43f6193..d72c22dcf685 100644 +--- a/drivers/gpu/drm/arm/malidp_crtc.c ++++ b/drivers/gpu/drm/arm/malidp_crtc.c +@@ -221,7 +221,7 @@ static int malidp_crtc_atomic_check_ctm(struct drm_crtc *crtc, + + /* + * The size of the ctm is checked in +- * drm_atomic_replace_property_blob_from_id. ++ * drm_property_replace_blob_from_id. + */ + ctm = (struct drm_color_ctm *)state->ctm->data; + for (i = 0; i < ARRAY_SIZE(ctm->matrix); ++i) { +diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c +index c277b198fa3f..c3df45f90145 100644 +--- a/drivers/gpu/drm/drm_atomic.c ++++ b/drivers/gpu/drm/drm_atomic.c +@@ -733,6 +733,7 @@ static void drm_atomic_plane_print_state(struct drm_printer *p, + drm_get_color_encoding_name(state->color_encoding)); + drm_printf(p, "\tcolor-range=%s\n", + drm_get_color_range_name(state->color_range)); ++ drm_printf(p, "\tcolor_mgmt_changed=%d\n", state->color_mgmt_changed); + + if (plane->funcs->atomic_print_state) + plane->funcs->atomic_print_state(p, state); +diff --git a/drivers/gpu/drm/drm_atomic_state_helper.c b/drivers/gpu/drm/drm_atomic_state_helper.c +index 784e63d70a42..25bb0859fda7 100644 +--- a/drivers/gpu/drm/drm_atomic_state_helper.c ++++ b/drivers/gpu/drm/drm_atomic_state_helper.c +@@ -338,6 +338,7 @@ void __drm_atomic_helper_plane_duplicate_state(struct drm_plane *plane, + state->fence = NULL; + state->commit = NULL; + state->fb_damage_clips = NULL; ++ state->color_mgmt_changed = false; + } + EXPORT_SYMBOL(__drm_atomic_helper_plane_duplicate_state); + +diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c +index d867e7f9f2cd..a6a9ee5086dd 100644 +--- a/drivers/gpu/drm/drm_atomic_uapi.c ++++ b/drivers/gpu/drm/drm_atomic_uapi.c +@@ -362,39 +362,6 @@ static s32 __user *get_out_fence_for_connector(struct drm_atomic_state *state, + return fence_ptr; + } + +-static int +-drm_atomic_replace_property_blob_from_id(struct drm_device *dev, +- struct drm_property_blob **blob, +- uint64_t blob_id, +- ssize_t expected_size, +- ssize_t expected_elem_size, +- bool *replaced) +-{ +- struct drm_property_blob *new_blob = NULL; +- +- if (blob_id != 0) { +- new_blob = drm_property_lookup_blob(dev, blob_id); +- if (new_blob == NULL) +- return -EINVAL; +- +- if (expected_size > 0 && +- new_blob->length != expected_size) { +- drm_property_blob_put(new_blob); +- return -EINVAL; +- } +- if (expected_elem_size > 0 && +- new_blob->length % expected_elem_size != 0) { +- drm_property_blob_put(new_blob); +- return -EINVAL; +- } +- } +- +- *replaced |= drm_property_replace_blob(blob, new_blob); +- drm_property_blob_put(new_blob); +- +- return 0; +-} +- + static int drm_atomic_crtc_set_property(struct drm_crtc *crtc, + struct drm_crtc_state *state, struct drm_property *property, + uint64_t val) +@@ -415,7 +382,7 @@ static int drm_atomic_crtc_set_property(struct drm_crtc *crtc, + } else if (property == config->prop_vrr_enabled) { + state->vrr_enabled = val; + } else if (property == config->degamma_lut_property) { +- ret = drm_atomic_replace_property_blob_from_id(dev, ++ ret = drm_property_replace_blob_from_id(dev, + &state->degamma_lut, + val, + -1, sizeof(struct drm_color_lut), +@@ -423,7 +390,7 @@ static int drm_atomic_crtc_set_property(struct drm_crtc *crtc, + state->color_mgmt_changed |= replaced; + return 
ret; + } else if (property == config->ctm_property) { +- ret = drm_atomic_replace_property_blob_from_id(dev, ++ ret = drm_property_replace_blob_from_id(dev, + &state->ctm, + val, + sizeof(struct drm_color_ctm), -1, +@@ -431,7 +398,7 @@ static int drm_atomic_crtc_set_property(struct drm_crtc *crtc, + state->color_mgmt_changed |= replaced; + return ret; + } else if (property == config->gamma_lut_property) { +- ret = drm_atomic_replace_property_blob_from_id(dev, ++ ret = drm_property_replace_blob_from_id(dev, + &state->gamma_lut, + val, + -1, sizeof(struct drm_color_lut), +@@ -563,7 +530,7 @@ static int drm_atomic_plane_set_property(struct drm_plane *plane, + } else if (property == plane->color_range_property) { + state->color_range = val; + } else if (property == config->prop_fb_damage_clips) { +- ret = drm_atomic_replace_property_blob_from_id(dev, ++ ret = drm_property_replace_blob_from_id(dev, + &state->fb_damage_clips, + val, + -1, +@@ -729,7 +696,7 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector, + if (state->link_status != DRM_LINK_STATUS_GOOD) + state->link_status = val; + } else if (property == config->hdr_output_metadata_property) { +- ret = drm_atomic_replace_property_blob_from_id(dev, ++ ret = drm_property_replace_blob_from_id(dev, + &state->hdr_output_metadata, + val, + sizeof(struct hdr_output_metadata), -1, +diff --git a/drivers/gpu/drm/drm_property.c b/drivers/gpu/drm/drm_property.c +index dfec479830e4..f72ef6493340 100644 +--- a/drivers/gpu/drm/drm_property.c ++++ b/drivers/gpu/drm/drm_property.c +@@ -751,6 +751,55 @@ bool drm_property_replace_blob(struct drm_property_blob **blob, + } + EXPORT_SYMBOL(drm_property_replace_blob); + ++/** ++ * drm_property_replace_blob_from_id - replace a blob property taking a reference ++ * @dev: DRM device ++ * @blob: a pointer to the member blob to be replaced ++ * @blob_id: the id of the new blob to replace with ++ * @expected_size: expected size of the blob property ++ * @expected_elem_size: expected size of an element in the blob property ++ * @replaced: if the blob was in fact replaced ++ * ++ * Look up the new blob from id, take its reference, check expected sizes of ++ * the blob and its element and replace the old blob by the new one. Advertise ++ * if the replacement operation was successful. ++ * ++ * Return: true if the blob was in fact replaced. -EINVAL if the new blob was ++ * not found or sizes don't match. 
++ */ ++int drm_property_replace_blob_from_id(struct drm_device *dev, ++ struct drm_property_blob **blob, ++ uint64_t blob_id, ++ ssize_t expected_size, ++ ssize_t expected_elem_size, ++ bool *replaced) ++{ ++ struct drm_property_blob *new_blob = NULL; ++ ++ if (blob_id != 0) { ++ new_blob = drm_property_lookup_blob(dev, blob_id); ++ if (new_blob == NULL) ++ return -EINVAL; ++ ++ if (expected_size > 0 && ++ new_blob->length != expected_size) { ++ drm_property_blob_put(new_blob); ++ return -EINVAL; ++ } ++ if (expected_elem_size > 0 && ++ new_blob->length % expected_elem_size != 0) { ++ drm_property_blob_put(new_blob); ++ return -EINVAL; ++ } ++ } ++ ++ *replaced |= drm_property_replace_blob(blob, new_blob); ++ drm_property_blob_put(new_blob); ++ ++ return 0; ++} ++EXPORT_SYMBOL(drm_property_replace_blob_from_id); ++ + int drm_mode_getblob_ioctl(struct drm_device *dev, + void *data, struct drm_file *file_priv) + { +diff --git a/include/drm/drm_mode_object.h b/include/drm/drm_mode_object.h +index 912f1e415685..08d7a7f0188f 100644 +--- a/include/drm/drm_mode_object.h ++++ b/include/drm/drm_mode_object.h +@@ -60,7 +60,7 @@ struct drm_mode_object { + void (*free_cb)(struct kref *kref); + }; + +-#define DRM_OBJECT_MAX_PROPERTY 24 ++#define DRM_OBJECT_MAX_PROPERTY 64 + /** + * struct drm_object_properties - property tracking for &drm_mode_object + */ +diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h +index 51291983ea44..52c3287da0da 100644 +--- a/include/drm/drm_plane.h ++++ b/include/drm/drm_plane.h +@@ -237,6 +237,13 @@ struct drm_plane_state { + + /** @state: backpointer to global drm_atomic_state */ + struct drm_atomic_state *state; ++ ++ /** ++ * @color_mgmt_changed: Color management properties have changed. Used ++ * by the atomic helpers and drivers to steer the atomic commit control ++ * flow. ++ */ ++ bool color_mgmt_changed : 1; + }; + + static inline struct drm_rect +diff --git a/include/drm/drm_property.h b/include/drm/drm_property.h +index 65bc9710a470..082f29156b3e 100644 +--- a/include/drm/drm_property.h ++++ b/include/drm/drm_property.h +@@ -279,6 +279,12 @@ struct drm_property_blob *drm_property_create_blob(struct drm_device *dev, + const void *data); + struct drm_property_blob *drm_property_lookup_blob(struct drm_device *dev, + uint32_t id); ++int drm_property_replace_blob_from_id(struct drm_device *dev, ++ struct drm_property_blob **blob, ++ uint64_t blob_id, ++ ssize_t expected_size, ++ ssize_t expected_elem_size, ++ bool *replaced); + int drm_property_replace_global_blob(struct drm_device *dev, + struct drm_property_blob **replace, + size_t length, +diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h +index 43691058d28f..23fc19400998 100644 +--- a/include/uapi/drm/drm_mode.h ++++ b/include/uapi/drm/drm_mode.h +@@ -843,6 +843,14 @@ struct drm_color_ctm { + __u64 matrix[9]; + }; + ++struct drm_color_ctm2 { ++ /* ++ * Conversion matrix in S31.32 sign-magnitude ++ * (not two's complement!) format. 
++ */ ++ __u64 matrix[12]; ++}; ++ + struct drm_color_lut { + /* + * Values are mapped linearly to 0.0 - 1.0 range, with 0x0 == 0.0 and +-- +2.42.0 + +From f43591177032844d0dec73debda8218267d6d2ef Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 28 Aug 2023 14:01:19 +0200 +Subject: [PATCH 2/7] amd-pref-core Signed-off-by: Peter Jung --- .../admin-guide/kernel-parameters.txt | 5 + - Documentation/admin-guide/pm/amd-pstate.rst | 54 +++++++ - arch/x86/Kconfig | 3 +- + Documentation/admin-guide/pm/amd-pstate.rst | 53 ++++++ + arch/x86/Kconfig | 5 +- drivers/acpi/cppc_acpi.c | 13 ++ drivers/acpi/processor_driver.c | 6 + drivers/cpufreq/amd-pstate-ut.c | 50 +++--- @@ -16,10 +2148,10 @@ Signed-off-by: Peter Jung include/acpi/cppc_acpi.h | 5 + include/linux/amd-pstate.h | 1 + include/linux/cpufreq.h | 4 + - 11 files changed, 259 insertions(+), 47 deletions(-) + 11 files changed, 259 insertions(+), 48 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 722b6eca2e938..ac95d4c9666e4 100644 +index 722b6eca2e93..ac95d4c9666e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -363,6 +363,11 @@ @@ -35,10 +2167,10 @@ index 722b6eca2e938..ac95d4c9666e4 100644 Map of devices attached to JOY0DAT and JOY1DAT Format: , diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst -index 1cf40f69278cd..ef2b69935311f 100644 +index 1cf40f69278c..2369b58a3521 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst -@@ -353,6 +353,48 @@ is activated. In this mode, driver requests minimum and maximum performance +@@ -353,6 +353,47 @@ is activated. In this mode, driver requests minimum and maximum performance level and the platform autonomously selects a performance level in this range and appropriate to the current workload. @@ -80,14 +2212,13 @@ index 1cf40f69278cd..ef2b69935311f 100644 + +``amd_prefcore=disable`` + -+If ``amd_prefcore=disable`` is passed to kernel command line option -+then disable ``AMD Pstate Preferred Core`` if platform can support -+the Preferred Core feature. ++``AMD Pstate Preferred Core`` will be enabled if the underlying platform ++supports it. It can be disabled by kernerl parameter: ``amd_prefcore=disable``. + User Space Interface in ``sysfs`` - General =========================================== -@@ -385,6 +427,18 @@ control its functionality at the system level. They are located in the +@@ -385,6 +426,18 @@ control its functionality at the system level. They are located in the to the operation mode represented by that string - or to be unregistered in the "disable" case. 
@@ -107,7 +2238,7 @@ index 1cf40f69278cd..ef2b69935311f 100644 =============================================== diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index e36261b4ea14f..03322d2840faa 100644 +index e36261b4ea14..16df141bd8a2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1052,8 +1052,9 @@ config SCHED_MC @@ -115,14 +2246,15 @@ index e36261b4ea14f..03322d2840faa 100644 config SCHED_MC_PRIO bool "CPU core priorities scheduler support" - depends on SCHED_MC && CPU_SUP_INTEL +- select X86_INTEL_PSTATE + depends on SCHED_MC - select X86_INTEL_PSTATE -+ select X86_AMD_PSTATE ++ select X86_INTEL_PSTATE if CPU_SUP_INTEL ++ select X86_AMD_PSTATE if CPU_SUP_AMD select CPU_FREQ default y help diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c -index 7ff269a78c208..ad388a0e84842 100644 +index 7ff269a78c20..ad388a0e8484 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1154,6 +1154,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) @@ -146,7 +2278,7 @@ index 7ff269a78c208..ad388a0e84842 100644 * cppc_get_epp_perf - Get the epp register value. * @cpunum: CPU from which to get epp preference value. diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c -index 4bd16b3f07814..29b2fb68a35db 100644 +index 4bd16b3f0781..29b2fb68a35d 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -27,6 +27,7 @@ @@ -170,7 +2302,7 @@ index 4bd16b3f07814..29b2fb68a35db 100644 acpi_handle_debug(handle, "Unsupported event [0x%x]\n", event); break; diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c -index 7f3fe20489818..f04ae67dda372 100644 +index 7f3fe2048981..f04ae67dda37 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -64,27 +64,9 @@ static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = { @@ -303,7 +2435,7 @@ index 7f3fe20489818..f04ae67dda372 100644 static int __init amd_pstate_ut_init(void) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index 9a1e194d5cf88..8a8e4ecb1b5c6 100644 +index 9a1e194d5cf8..8a8e4ecb1b5c 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -37,6 +37,7 @@ @@ -572,7 +2704,7 @@ index 9a1e194d5cf88..8a8e4ecb1b5c6 100644 MODULE_AUTHOR("Huang Rui "); MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver"); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c -index 50bbc969ffe53..842357abfae60 100644 +index 50bbc969ffe5..842357abfae6 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2675,6 +2675,19 @@ void cpufreq_update_limits(unsigned int cpu) @@ -596,7 +2728,7 @@ index 50bbc969ffe53..842357abfae60 100644 * BOOST * *********************************************************************/ diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h -index 6126c977ece04..c0b69ffe7bdb4 100644 +index 6126c977ece0..c0b69ffe7bdb 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -139,6 +139,7 @@ struct cppc_cpudata { @@ -619,7 +2751,7 @@ index 6126c977ece04..c0b69ffe7bdb4 100644 { return -ENOTSUPP; diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h -index 446394f846064..fa86bc953d3e0 100644 +index 446394f84606..fa86bc953d3e 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h @@ -70,6 +70,7 @@ struct amd_cpudata { @@ -631,7 +2763,7 @@ index 446394f846064..fa86bc953d3e0 100644 u32 max_freq; u32 min_freq; diff --git a/include/linux/cpufreq.h 
b/include/linux/cpufreq.h -index 172ff51c1b2a4..766c83a4fae74 100644 +index 172ff51c1b2a..766c83a4fae7 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -231,6 +231,7 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); @@ -653,12 +2785,12 @@ index 172ff51c1b2a4..766c83a4fae74 100644 int (*bios_limit)(int cpu, unsigned int *limit); -- -2.41.0 +2.42.0 -From 85c40edbbd82439d1ca1e367eed47ad58119a341 Mon Sep 17 00:00:00 2001 +From b35ba9f5a6ca4ac70053f1120b2042daa320ea59 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sun, 13 Aug 2023 22:53:18 +0200 -Subject: [PATCH 2/6] bbr3 +Subject: [PATCH 3/7] bbr3 Signed-off-by: Peter Jung --- @@ -680,7 +2812,7 @@ Signed-off-by: Peter Jung 15 files changed, 1934 insertions(+), 551 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h -index 91a37c99ba665..ae0ee688c3f7b 100644 +index 91a37c99ba66..ae0ee688c3f7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -255,7 +255,9 @@ struct tcp_sock { @@ -695,7 +2827,7 @@ index 91a37c99ba665..ae0ee688c3f7b 100644 u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h -index c2b15f7e55161..a400a84088d38 100644 +index c2b15f7e5516..a400a84088d3 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -135,8 +135,8 @@ struct inet_connection_sock { @@ -710,7 +2842,7 @@ index c2b15f7e55161..a400a84088d38 100644 #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/include/net/tcp.h b/include/net/tcp.h -index 0ca972ebd3dd0..8eb194559b701 100644 +index 0ca972ebd3dd..8eb194559b70 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -370,6 +370,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, @@ -882,7 +3014,7 @@ index 0ca972ebd3dd0..8eb194559b701 100644 static inline void tcp_plb_init(const struct sock *sk, struct tcp_plb_state *plb) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h -index 50655de04c9b6..82f8bd8f0d161 100644 +index 50655de04c9b..82f8bd8f0d16 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -229,6 +229,29 @@ struct tcp_bbr_info { @@ -916,7 +3048,7 @@ index 50655de04c9b6..82f8bd8f0d161 100644 union tcp_cc_info { diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h -index 51c13cf9c5aee..de8dcba26becc 100644 +index 51c13cf9c5ae..de8dcba26bec 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -506,9 +506,11 @@ enum { @@ -933,7 +3065,7 @@ index 51c13cf9c5aee..de8dcba26becc 100644 struct rta_session { __u8 proto; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h -index 879eeb0a084b4..77270053a5e39 100644 +index 879eeb0a084b..77270053a5e3 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail { @@ -945,7 +3077,7 @@ index 879eeb0a084b4..77270053a5e39 100644 /* * Sender's congestion state indicating normal or abnormal situations diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 2dfb12230f089..2e14db3bee704 100644 +index 2dfb12230f08..2e14db3bee70 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -668,15 +668,18 @@ config TCP_CONG_BBR @@ -977,7 +3109,7 @@ index 2dfb12230f089..2e14db3bee704 100644 choice prompt "Default TCP congestion control" diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 8ed52e1e3c99a..0198ac17f3a8f 100644 +index 
8ed52e1e3c99..0198ac17f3a8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3083,6 +3083,7 @@ int tcp_disconnect(struct sock *sk, int flags) @@ -998,7 +3130,7 @@ index 8ed52e1e3c99a..0198ac17f3a8f 100644 info->tcpi_options |= TCPI_OPT_SYN_DATA; diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c -index 146792cd26fed..f4f477a69917d 100644 +index 146792cd26fe..f4f477a69917 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1,18 +1,19 @@ @@ -3643,7 +5775,7 @@ index 146792cd26fed..f4f477a69917d 100644 MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); +MODULE_VERSION(__stringify(BBR_VERSION)); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c -index 1b34050a7538b..66d40449b3f4f 100644 +index 1b34050a7538..66d40449b3f4 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -241,6 +241,7 @@ void tcp_init_congestion_control(struct sock *sk) @@ -3655,7 +5787,7 @@ index 1b34050a7538b..66d40449b3f4f 100644 icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index 57c8af1859c16..2195ba488142a 100644 +index 57c8af1859c1..2195ba488142 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -349,7 +349,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) @@ -3789,7 +5921,7 @@ index 57c8af1859c16..2195ba488142a 100644 tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c -index c8f2aa0033871..fdf51e436899f 100644 +index c8f2aa003387..fdf51e436899 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -440,6 +440,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) @@ -3802,7 +5934,7 @@ index c8f2aa0033871..fdf51e436899f 100644 const struct tcp_congestion_ops *ca; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 51d8638d4b4c6..2fb064057868a 100644 +index 51d8638d4b4c..2fb064057868 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -325,10 +325,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) @@ -3913,7 +6045,7 @@ index 51d8638d4b4c6..2fb064057868a 100644 goto rearm_timer; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c -index a8f6d9d06f2eb..8737f21346481 100644 +index a8f6d9d06f2e..8737f2134648 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -34,6 +34,24 @@ @@ -3993,7 +6125,7 @@ index a8f6d9d06f2eb..8737f21346481 100644 rs->interval_us = max(snd_us, ack_us); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c -index 206418b6d7c48..619069963ff07 100644 +index 206418b6d7c4..619069963ff0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -626,6 +626,7 @@ void tcp_write_timer_handler(struct sock *sk) @@ -4005,12 +6137,12 @@ index 206418b6d7c48..619069963ff07 100644 event = icsk->icsk_pending; -- -2.41.0 +2.42.0 -From 7f942a85c0cc0c584314cee751f793e8a7dc93ba Mon Sep 17 00:00:00 2001 +From 41db757e2b0e00035bdd9692a6b5d143eac1d33e Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 20 Aug 2023 15:54:59 +0200 -Subject: [PATCH 3/6] cachy +Date: Mon, 28 Aug 2023 14:01:56 +0200 +Subject: [PATCH 4/7] cachy Signed-off-by: Peter Jung --- @@ -4071,7 +6203,7 @@ Signed-off-by: Peter Jung create mode 100644 drivers/platform/x86/steamdeck.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index ac95d4c9666e4..b3eecf5b94f40 100644 +index ac95d4c9666e..b3eecf5b94f4 100644 --- 
a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4276,6 +4276,15 @@ @@ -4091,7 +6223,7 @@ index ac95d4c9666e4..b3eecf5b94f40 100644 Safety option to keep boot IRQs enabled. This should never be necessary. diff --git a/Makefile b/Makefile -index 4739c21a63e2e..daf528173b398 100644 +index 2fdd8b40b7e0..8a601d85cd3f 100644 --- a/Makefile +++ b/Makefile @@ -831,6 +831,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) @@ -4117,7 +6249,7 @@ index 4739c21a63e2e..daf528173b398 100644 KBUILD_CFLAGS += -Werror=date-time diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig -index 81764160451f7..2c15d3bf747a9 100644 +index 81764160451f..2c15d3bf747a 100644 --- a/arch/arc/configs/axs101_defconfig +++ b/arch/arc/configs/axs101_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y @@ -4129,7 +6261,7 @@ index 81764160451f7..2c15d3bf747a9 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig -index d5181275490ed..7d868e148d9a4 100644 +index d5181275490e..7d868e148d9a 100644 --- a/arch/arc/configs/axs103_defconfig +++ b/arch/arc/configs/axs103_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y @@ -4141,7 +6273,7 @@ index d5181275490ed..7d868e148d9a4 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig -index 07c89281c2e3a..1513324ddb008 100644 +index 07c89281c2e3..1513324ddb00 100644 --- a/arch/arc/configs/axs103_smp_defconfig +++ b/arch/arc/configs/axs103_smp_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y @@ -4153,7 +6285,7 @@ index 07c89281c2e3a..1513324ddb008 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig -index 8c3ed5d6e6c35..2db643853e8f4 100644 +index 8c3ed5d6e6c3..2db643853e8f 100644 --- a/arch/arc/configs/haps_hs_defconfig +++ b/arch/arc/configs/haps_hs_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y @@ -4165,7 +6297,7 @@ index 8c3ed5d6e6c35..2db643853e8f4 100644 CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig -index 61107e8bac336..d764007e5adad 100644 +index 61107e8bac33..d764007e5ada 100644 --- a/arch/arc/configs/haps_hs_smp_defconfig +++ b/arch/arc/configs/haps_hs_smp_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y @@ -4177,7 +6309,7 @@ index 61107e8bac336..d764007e5adad 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig -index 4ee2a1507b57f..ce6a4431a76dd 100644 +index 4ee2a1507b57..ce6a4431a76d 100644 --- a/arch/arc/configs/hsdk_defconfig +++ b/arch/arc/configs/hsdk_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y @@ -4189,7 +6321,7 @@ index 4ee2a1507b57f..ce6a4431a76dd 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig -index 3e98297759925..5044609540cc3 100644 +index 3e9829775992..5044609540cc 100644 --- a/arch/arc/configs/nsim_700_defconfig +++ b/arch/arc/configs/nsim_700_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y @@ -4201,7 +6333,7 @@ index 3e98297759925..5044609540cc3 100644 CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig -index 
502c87f351c87..748c809d1c4c6 100644 +index 502c87f351c8..748c809d1c4c 100644 --- a/arch/arc/configs/nsimosci_defconfig +++ b/arch/arc/configs/nsimosci_defconfig @@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y @@ -4213,7 +6345,7 @@ index 502c87f351c87..748c809d1c4c6 100644 CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig -index f721cc3997d02..205c32b0074ca 100644 +index f721cc3997d0..205c32b0074c 100644 --- a/arch/arc/configs/nsimosci_hs_defconfig +++ b/arch/arc/configs/nsimosci_hs_defconfig @@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y @@ -4225,7 +6357,7 @@ index f721cc3997d02..205c32b0074ca 100644 CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig -index 1419fc946a083..2477b7c809771 100644 +index 1419fc946a08..2477b7c80977 100644 --- a/arch/arc/configs/nsimosci_hs_smp_defconfig +++ b/arch/arc/configs/nsimosci_hs_smp_defconfig @@ -8,6 +8,7 @@ CONFIG_IKCONFIG_PROC=y @@ -4237,7 +6369,7 @@ index 1419fc946a083..2477b7c809771 100644 # CONFIG_COMPAT_BRK is not set CONFIG_KPROBES=y diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig -index 941bbadd6bf2c..e61132ba4f890 100644 +index 941bbadd6bf2..e61132ba4f89 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -14,6 +14,7 @@ CONFIG_INITRAMFS_SOURCE="../tb10x-rootfs.cpio" @@ -4249,7 +6381,7 @@ index 941bbadd6bf2c..e61132ba4f890 100644 # CONFIG_AIO is not set CONFIG_EMBEDDED=y diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig -index d3ef189c75f8b..922b1b24f5184 100644 +index d3ef189c75f8..922b1b24f518 100644 --- a/arch/arc/configs/vdk_hs38_defconfig +++ b/arch/arc/configs/vdk_hs38_defconfig @@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y @@ -4261,7 +6393,7 @@ index d3ef189c75f8b..922b1b24f5184 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig -index 944b347025fd1..ed64319f7eb29 100644 +index 944b347025fd..ed64319f7eb2 100644 --- a/arch/arc/configs/vdk_hs38_smp_defconfig +++ b/arch/arc/configs/vdk_hs38_smp_defconfig @@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y @@ -4273,7 +6405,7 @@ index 944b347025fd1..ed64319f7eb29 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu -index 00468adf180f1..46cc91cb622fc 100644 +index 00468adf180f..46cc91cb622f 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -157,7 +157,7 @@ config MPENTIUM4 @@ -4806,7 +6938,7 @@ index 00468adf180f1..46cc91cb622fc 100644 config IA32_FEAT_CTL def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile -index fdc2e3abd6152..63845db8bf8a5 100644 +index fdc2e3abd615..63845db8bf8a 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -67,7 +67,7 @@ export BITS @@ -4870,7 +7002,7 @@ index fdc2e3abd6152..63845db8bf8a5 100644 KBUILD_CFLAGS += $(cflags-y) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h -index b40c462b4af36..c4e66e60d559d 100644 +index b40c462b4af3..c4e66e60d559 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -27,6 +27,7 @@ struct pci_sysdata { @@ -4894,7 +7026,7 @@ index b40c462b4af36..c4e66e60d559d 100644 already-configured bus numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the loader */ diff --git a/arch/x86/include/asm/vermagic.h 
b/arch/x86/include/asm/vermagic.h -index 75884d2cdec37..02c1386eb653e 100644 +index 75884d2cdec3..02c1386eb653 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -17,6 +17,54 @@ @@ -4986,7 +7118,7 @@ index 75884d2cdec37..02c1386eb653e 100644 #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c -index ddb798603201e..7c20387d82029 100644 +index ddb798603201..7c20387d8202 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void) @@ -5008,7 +7140,7 @@ index ddb798603201e..7c20387d82029 100644 } -#endif diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index 3cce6de464a7b..9176bc4f07daa 100644 +index 3cce6de464a7..9176bc4f07da 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7627,6 +7627,7 @@ MODULE_ALIAS("bfq-iosched"); @@ -5032,7 +7164,7 @@ index 3cce6de464a7b..9176bc4f07daa 100644 slab_kill: diff --git a/drivers/Makefile b/drivers/Makefile -index 7241d80a7b293..ac0ca3498f43e 100644 +index 7241d80a7b29..ac0ca3498f43 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -64,15 +64,8 @@ obj-y += char/ @@ -5067,7 +7199,7 @@ index 7241d80a7b293..ac0ca3498f43e 100644 obj-$(CONFIG_MTD) += mtd/ obj-$(CONFIG_SPI) += spi/ diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c -index addba109406be..f819ee132ffa2 100644 +index addba109406b..f819ee132ffa 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -1522,7 +1522,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) @@ -5123,7 +7255,7 @@ index addba109406be..f819ee132ffa2 100644 sysfs_add_file_to_group(&pdev->dev.kobj, &dev_attr_remapped_nvme.attr, diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 -index 438c9e75a04dc..1bbfeca5f01ec 100644 +index 438c9e75a04d..1bbfeca5f01e 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -9,7 +9,6 @@ config X86_INTEL_PSTATE @@ -5143,7 +7275,7 @@ index 438c9e75a04dc..1bbfeca5f01ec 100644 This driver adds a CPUFreq driver which utilizes a fine grain processor performance frequency control range instead of legacy diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig -index 9cfe8fc509d7d..efc3b0c0b4adb 100644 +index 9cfe8fc509d7..efc3b0c0b4ad 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -229,6 +229,15 @@ config I2C_CHT_WC @@ -5163,7 +7295,7 @@ index 9cfe8fc509d7d..efc3b0c0b4adb 100644 tristate "Nvidia nForce2, nForce3 and nForce4" depends on PCI diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile -index af56fe2c75c09..76be74584719e 100644 +index af56fe2c75c0..76be74584719 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC) += i2c-cht-wc.o @@ -5176,7 +7308,7 @@ index af56fe2c75c09..76be74584719e 100644 obj-$(CONFIG_I2C_NVIDIA_GPU) += i2c-nvidia-gpu.o diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c new file mode 100644 -index 0000000000000..0462f09520431 +index 000000000000..0462f0952043 --- /dev/null +++ b/drivers/i2c/busses/i2c-nct6775.c @@ -0,0 +1,647 @@ @@ -5828,7 +7960,7 @@ index 0000000000000..0462f09520431 +module_init(i2c_nct6775_init); +module_exit(i2c_nct6775_exit); diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c -index 809fbd014cd68..d54b35b147ee9 100644 +index 809fbd014cd6..d54b35b147ee 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ 
b/drivers/i2c/busses/i2c-piix4.c @@ -568,11 +568,11 @@ static int piix4_transaction(struct i2c_adapter *piix4_adapter) @@ -5846,7 +7978,7 @@ index 809fbd014cd68..d54b35b147ee9 100644 /* If the SMBus is still busy, we give up */ if (timeout == MAX_TIMEOUT) { diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c -index 1dc6227d353ec..bab1009ccef79 100644 +index 1dc6227d353e..bab1009ccef7 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3240,6 +3240,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) @@ -5862,7 +7994,7 @@ index 1dc6227d353ec..bab1009ccef79 100644 if (ret < 0) goto bad; diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile -index 37c8663de7fe1..897d19f92edeb 100644 +index 37c8663de7fe..897d19f92ede 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -1,4 +1,10 @@ @@ -5878,7 +8010,7 @@ index 37c8663de7fe1..897d19f92edeb 100644 obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c new file mode 100644 -index 0000000000000..e105e6f5cc91d +index 000000000000..e105e6f5cc91 --- /dev/null +++ b/drivers/pci/controller/intel-nvme-remap.c @@ -0,0 +1,462 @@ @@ -6345,7 +8477,7 @@ index 0000000000000..e105e6f5cc91d +MODULE_AUTHOR("Daniel Drake "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 321156ca273d5..5dda26c737e2c 100644 +index 321156ca273d..5dda26c737e2 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3718,6 +3718,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) @@ -6464,7 +8596,7 @@ index 321156ca273d5..5dda26c737e2c 100644 }; diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig -index 49c2c4cd8d000..956f4eff85b5b 100644 +index 49c2c4cd8d00..956f4eff85b5 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -643,6 +643,16 @@ config THINKPAD_LMI @@ -6506,7 +8638,7 @@ index 49c2c4cd8d000..956f4eff85b5b 100644 config P2SB diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile -index 52dfdf574ac2d..d32b6d87219ff 100644 +index 52dfdf574ac2..d32b6d87219f 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -66,6 +66,7 @@ obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o @@ -6526,7 +8658,7 @@ index 52dfdf574ac2d..d32b6d87219ff 100644 +obj-$(CONFIG_STEAMDECK) += steamdeck.o diff --git a/drivers/platform/x86/legion-laptop.c b/drivers/platform/x86/legion-laptop.c new file mode 100644 -index 0000000000000..d1268d239cc5f +index 000000000000..d1268d239cc5 --- /dev/null +++ b/drivers/platform/x86/legion-laptop.c @@ -0,0 +1,2783 @@ @@ -9315,7 +11447,7 @@ index 0000000000000..d1268d239cc5f +module_exit(legion_exit); diff --git a/drivers/platform/x86/steamdeck.c b/drivers/platform/x86/steamdeck.c new file mode 100644 -index 0000000000000..77a6677ec19e6 +index 000000000000..77a6677ec19e --- /dev/null +++ b/drivers/platform/x86/steamdeck.c @@ -0,0 +1,523 @@ @@ -9843,7 +11975,7 @@ index 0000000000000..77a6677ec19e6 +MODULE_DESCRIPTION("Steam Deck ACPI platform driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mm.h b/include/linux/mm.h -index 406ab9ea818fe..17794c2130550 100644 +index 34f9dba17c1a..4527f319019a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -191,7 +191,7 @@ static inline void __mm_zero_struct_page(struct page *page) @@ -9856,7 +11988,7 @@ index 406ab9ea818fe..17794c2130550 100644 extern int sysctl_max_map_count; 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index 716953ee1ebdb..dace360dc38d7 100644 +index 716953ee1ebd..dace360dc38d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1181,7 +1181,7 @@ struct readahead_control { @@ -9869,7 +12001,7 @@ index 716953ee1ebdb..dace360dc38d7 100644 void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h -index 45f09bec02c48..87b20e2ee2744 100644 +index 45f09bec02c4..87b20e2ee274 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -148,6 +148,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, @@ -9891,7 +12023,7 @@ index 45f09bec02c48..87b20e2ee2744 100644 { return &init_user_ns; diff --git a/init/Kconfig b/init/Kconfig -index f7f65af4ee129..71755cc8ed3e4 100644 +index f7f65af4ee12..71755cc8ed3e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -123,6 +123,10 @@ config THREAD_INFO_IN_TASK @@ -9942,7 +12074,7 @@ index f7f65af4ee129..71755cc8ed3e4 100644 bool "Optimize for size (-Os)" help diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 38ef6d06888ef..0f78364efd4f2 100644 +index 38ef6d06888e..0f78364efd4f 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -40,6 +40,27 @@ choice @@ -9984,7 +12116,7 @@ index 38ef6d06888ef..0f78364efd4f2 100644 config SCHED_HRTICK diff --git a/kernel/fork.c b/kernel/fork.c -index d2e12b6d2b180..95ca80492a379 100644 +index d2e12b6d2b18..95ca80492a37 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -100,6 +100,10 @@ @@ -10023,7 +12155,7 @@ index d2e12b6d2b180..95ca80492a379 100644 if (err) goto bad_unshare_out; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index b3e25be58e2b7..2c335df301718 100644 +index b3e25be58e2b..2c335df30171 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -69,9 +69,13 @@ @@ -10083,7 +12215,7 @@ index b3e25be58e2b7..2c335df301718 100644 #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 354a2d294f526..4dc780aa3bcc8 100644 +index 354a2d294f52..4dc780aa3bcc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,6 +95,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); @@ -10113,7 +12245,7 @@ index 354a2d294f526..4dc780aa3bcc8 100644 { .procname = "tainted", diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 1d8e47bed3f11..fec01d016a351 100644 +index 1d8e47bed3f1..fec01d016a35 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,6 +22,13 @@ @@ -10131,7 +12263,7 @@ index 1d8e47bed3f11..fec01d016a351 100644 static DEFINE_MUTEX(userns_state_mutex); diff --git a/mm/Kconfig b/mm/Kconfig -index 09130434e30d3..f772ba88df878 100644 +index 09130434e30d..f772ba88df87 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -631,7 +631,7 @@ config COMPACTION @@ -10144,7 +12276,7 @@ index 09130434e30d3..f772ba88df878 100644 # diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index d3f42009bb702..39b9fd0606304 100644 +index d3f42009bb70..39b9fd060630 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -71,7 +71,11 @@ static long ratelimit_pages = 32; @@ -10172,7 +12304,7 @@ index d3f42009bb702..39b9fd0606304 100644 EXPORT_SYMBOL_GPL(dirty_writeback_interval); diff --git a/mm/swap.c b/mm/swap.c -index cd8f0150ba3aa..42c405a4f114c 100644 +index cd8f0150ba3a..42c405a4f114 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1090,6 +1090,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) @@ -10193,7 +12325,7 @@ index cd8f0150ba3aa..42c405a4f114c 100644 +#endif } diff --git a/mm/vmpressure.c b/mm/vmpressure.c -index b52644771cc43..11a4b0e3b583c 100644 +index b52644771cc4..11a4b0e3b583 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; @@ -10209,7 +12341,7 @@ index b52644771cc43..11a4b0e3b583c 100644 /* diff --git a/mm/vmscan.c b/mm/vmscan.c -index 1080209a568bb..f76aa82682152 100644 +index 2fe4a11d63f4..445ce9324b01 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -186,7 +186,11 @@ struct scan_control { @@ -10224,7 +12356,7 @@ index 1080209a568bb..f76aa82682152 100644 LIST_HEAD(shrinker_list); DECLARE_RWSEM(shrinker_rwsem); -@@ -4593,7 +4597,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc +@@ -4594,7 +4598,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc } /* to protect the working set of the last N jiffies */ @@ -10237,24 +12369,334 @@ index 1080209a568bb..f76aa82682152 100644 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { -- -2.41.0 +2.42.0 -From 84774938778953b047ed348f924e2c9fae19e5cc Mon Sep 17 00:00:00 2001 +From b05442522d6f62443d6bbd57d68868d96910ee2e Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 20 Aug 2023 15:55:14 +0200 -Subject: [PATCH 4/6] fixes +Date: Mon, 28 Aug 2023 14:02:22 +0200 +Subject: [PATCH 5/7] fixes Signed-off-by: Peter Jung --- - drivers/bluetooth/btusb.c | 2 +- - include/linux/pageblock-flags.h | 2 +- - kernel/padata.c | 4 ++-- - mm/readahead.c | 10 +++++++++- - sound/pci/hda/cs35l41_hda.c | 2 +- - 5 files changed, 14 insertions(+), 6 deletions(-) + Documentation/ABI/stable/sysfs-block | 10 + + .../testing/sysfs-class-led-trigger-blkdev | 78 ++ + Documentation/leds/index.rst | 1 + + Documentation/leds/ledtrig-blkdev.rst | 158 +++ + block/mq-deadline.c | 3 +- + drivers/bluetooth/btusb.c | 2 +- + drivers/char/tpm/tpm_crb.c | 33 +- + drivers/leds/trigger/Kconfig | 9 + 
+ drivers/leds/trigger/Makefile | 1 + + drivers/leds/trigger/ledtrig-blkdev.c | 1218 +++++++++++++++++ + drivers/pinctrl/pinctrl-amd.c | 4 +- + include/linux/pageblock-flags.h | 2 +- + kernel/padata.c | 4 +- + mm/readahead.c | 10 +- + scripts/Makefile.vmlinux_o | 2 +- + sound/pci/hda/cs35l41_hda.c | 2 +- + 16 files changed, 1502 insertions(+), 35 deletions(-) + create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev + create mode 100644 Documentation/leds/ledtrig-blkdev.rst + create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c +diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block +index c57e5b7cb532..2d1df6c9b463 100644 +--- a/Documentation/ABI/stable/sysfs-block ++++ b/Documentation/ABI/stable/sysfs-block +@@ -101,6 +101,16 @@ Description: + devices that support receiving integrity metadata. + + ++What: /sys/block//linked_leds ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Directory that contains symbolic links to all LEDs that ++ are associated with (linked to) this block device by the ++ blkdev LED trigger. Only present when at least one LED ++ is linked. (See Documentation/leds/ledtrig-blkdev.rst.) ++ ++ + What: /sys/block///alignment_offset + Date: April 2009 + Contact: Martin K. Petersen +diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev +new file mode 100644 +index 000000000000..28ce8c814fb7 +--- /dev/null ++++ b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev +@@ -0,0 +1,78 @@ ++What: /sys/class/leds//blink_time ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Time (in milliseconds) that the LED will be on during a single ++ "blink". ++ ++What: /sys/class/leds//check_interval ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Interval (in milliseconds) between checks of the block devices ++ linked to this LED. The LED will be blinked if the correct type ++ of activity (see blink_on_{read,write,discard,flush} attributes) ++ has occurred on any of the linked devices since the previous ++ check. ++ ++What: /sys/class/leds//blink_on_read ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to read activity on any of its linked block devices. ++ ++What: /sys/class/leds//blink_on_write ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to write activity on any of its linked block devices. ++ ++What: /sys/class/leds//blink_on_discard ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to discard activity on any of its linked block devices. ++ ++What: /sys/class/leds//blink_on_flush ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Boolean that determines whether the LED will blink in response ++ to cache flush activity on any of its linked block devices. ++ ++What: /sys/class/leds//link_dev_by_path ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Associate a block device with this LED by writing the path to ++ the device special file (e.g. /dev/sda) to this attribute. ++ Symbolic links are followed. ++ ++What: /sys/class/leds//unlink_dev_by_path ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Remove the association between this LED and a block device by ++ writing the path to the device special file (e.g. /dev/sda) to ++ this attribute. 
Symbolic links are followed. ++ ++What: /sys/class/leds//unlink_dev_by_name ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Remove the association between this LED and a block device by ++ writing the kernel name of the device (e.g. sda) to this ++ attribute. ++ ++What: /sys/class/leds//linked_devices ++Date: January 2023 ++Contact: Ian Pilcher ++Description: ++ Directory containing links to all block devices that are ++ associated with this LED. (Note that the names of the ++ symbolic links in this directory are *kernel* names, which ++ may not match the device special file paths written to ++ link_device and unlink_device.) +diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst +index 3ade16c18328..3fd55a2cbfb5 100644 +--- a/Documentation/leds/index.rst ++++ b/Documentation/leds/index.rst +@@ -10,6 +10,7 @@ LEDs + leds-class + leds-class-flash + leds-class-multicolor ++ ledtrig-blkdev + ledtrig-oneshot + ledtrig-transient + ledtrig-usbport +diff --git a/Documentation/leds/ledtrig-blkdev.rst b/Documentation/leds/ledtrig-blkdev.rst +new file mode 100644 +index 000000000000..9ff5b99de451 +--- /dev/null ++++ b/Documentation/leds/ledtrig-blkdev.rst +@@ -0,0 +1,158 @@ ++.. SPDX-License-Identifier: GPL-2.0 ++ ++================================= ++Block Device (blkdev) LED Trigger ++================================= ++ ++Available when ``CONFIG_LEDS_TRIGGER_BLKDEV=y`` or ++``CONFIG_LEDS_TRIGGER_BLKDEV=m``. ++ ++See also: ++ ++* ``Documentation/ABI/testing/sysfs-class-led-trigger-blkdev`` ++* ``Documentation/ABI/stable/sysfs-block`` (``/sys/block//linked_leds``) ++ ++Overview ++======== ++ ++.. note:: ++ The examples below use ```` to refer to the name of a ++ system-specific LED. If no suitable LED is available on a test ++ system (in a virtual machine, for example), it is possible to ++ use a userspace LED. (See ``Documentation/leds/uleds.rst``.) ++ ++Verify that the ``blkdev`` LED trigger is available:: ++ ++ # grep blkdev /sys/class/leds//trigger ++ ... rfkill-none blkdev ++ ++(If the previous command produces no output, you may need to load the trigger ++module - ``modprobe ledtrig_blkdev``. If the module is not available, check ++the value of ``CONFIG_LEDS_TRIGGER_BLKDEV`` in your kernel configuration.) ++ ++Associate the LED with the ``blkdev`` LED trigger:: ++ ++ # echo blkdev > /sys/class/leds//trigger ++ ++ # cat /sys/class/leds//trigger ++ ... rfkill-none [blkdev] ++ ++Note that several new device attributes are available in the ++``/sys/class/leds/`` directory. ++ ++* ``link_dev_by_path``, ``unlink_dev_by_path``, and ``unlink_dev_by_name`` are ++ used to manage the set of block devices associated with this LED. The LED ++ will blink when activity occurs on any of its linked devices. ++ ++* ``blink_on_read``, ``blink_on_write``, ``blink_on_discard``, and ++ ``blink_on_flush`` are boolean values that determine whether the LED will ++ blink when a particular type of activity is detected on one of its linked ++ block devices. ++ ++* ``blink_time`` is the duration (in milliseconds) of each blink of this LED. ++ (The minimum value is 10 milliseconds.) ++ ++* ``check_interval`` is the frequency (in milliseconds) with which block devices ++ linked to this LED will be checked for activity and the LED blinked (if the ++ correct type of activity has occurred). ++ ++* The ``linked_devices`` directory will contain a symbolic link to every device ++ that is associated with this LED. 
++ ++Link a block device to the LED:: ++ ++ # echo /dev/sda > /sys/class/leds//link_dev_by_path ++ ++ # ls /sys/class/leds//linked_devices ++ sda ++ ++(The value written to ``link_dev_by_path`` must be the path of the device ++special file, such as ``/dev/sda``, that represents the block device - or the ++path of a symbolic link to such a device special file.) ++ ++Activity on the device will now cause the LED to blink. The duration of each ++blink (in milliseconds) can be adjusted by setting ++``/sys/class/leds//blink_time``. (But see **check_interval and ++blink_time** below.) ++ ++Associate a second device with the LED:: ++ ++ # echo /dev/sdb > /sys/class/leds//link_dev_by_path ++ ++ # ls /sys/class/leds//linked_devices ++ sda sdb ++ ++When a block device is linked to one or more LEDs, the LEDs are linked from ++the device's ``linked_leds`` directory:: ++ ++ # ls /sys/class/block/sd{a,b}/linked_leds ++ /sys/class/block/sda/linked_leds: ++ ++ ++ /sys/class/block/sdb/linked_leds: ++ ++ ++(The ``linked_leds`` directory only exists when the block device is linked to ++at least one LED.) ++ ++``check_interval`` and ``blink_time`` ++===================================== ++ ++* By default, linked block devices are checked for activity every 100 ++ milliseconds. This frequency can be changed for an LED via the ++ ``/sys/class/leds//check_interval`` attribute. (The minimum value is 25 ++ milliseconds.) ++ ++* All block devices associated with an LED are checked for activity every ++ ``check_interval`` milliseconds, and a blink is triggered if the correct type ++ of activity (as determined by the LED's ``blink_on_*`` attributes) is ++ detected. The duration of an LED's blink is determined by its ``blink_time`` ++ attribute. Thus (when the correct type of activity is detected), the LED will ++ be on for ``blink_time`` milliseconds and off for ++ ``check_interval - blink_time`` milliseconds. ++ ++* The LED subsystem ignores new blink requests for an LED that is already in ++ in the process of blinking, so setting a ``blink_time`` greater than or equal ++ to ``check_interval`` will cause some blinks to be missed. ++ ++* Because of processing times, scheduling latencies, etc., avoiding missed ++ blinks actually requires a difference of at least a few milliseconds between ++ the ``blink_time`` and ``check_interval``. The required difference is likely ++ to vary from system to system. As a reference, a Thecus N5550 NAS requires a ++ difference of 7 milliseconds (e.g. ``check_interval == 100``, ++ ``blink_time == 93``). ++ ++* The default values (``check_interval == 100``, ``blink_time == 75``) cause the ++ LED associated with a continuously active device to blink rapidly. For a more ++ "always on" effect, increase the ``blink_time`` (but not too much; see the ++ previous bullet). ++ ++Other Notes ++=========== ++ ++* Many (possibly all) types of block devices work with this trigger, including: ++ ++ * SCSI (including SATA and USB) hard disk drives and SSDs ++ * SCSI (including SATA and USB) optical drives ++ * NVMe SSDs ++ * SD cards ++ * loopback block devices (``/dev/loop*``) ++ * device mapper devices, such as LVM logical volumes ++ * MD RAID devices ++ * zRAM compressed RAM-disks ++ * partitions on block devices that support them ++ ++* The names of the symbolic links in ``/sys/class/leds//linked_devices`` ++ are **kernel** names, which may not match the paths used for ++ ``link_dev_by_path`` and ``unlink_dev_by_path``. 
This is most likely when a ++ symbolic link is used to refer to the device (as is common with logical ++ volumes), but it can be true for any device, because nothing prevents the ++ creation of device special files with arbitrary names (e.g. ++ ``sudo mknod /foo b 8 0``). ++ ++ Kernel names can be used to unlink block devices from LEDs by writing them to ++ the LED's ``unlink_dev_by_name`` attribute. ++ ++* The ``blkdev`` LED trigger supports many-to-many device/LED associations. ++ A device can be associated with multiple LEDs, and an LED can be associated ++ with multiple devices. +diff --git a/block/mq-deadline.c b/block/mq-deadline.c +index 02a916ba62ee..f958e79277b8 100644 +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -646,8 +646,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; ++ unsigned int shift = tags->bitmap_tags.sb.shift; + +- dd->async_depth = max(1UL, 3 * q->nr_requests / 4); ++ dd->async_depth = max(1U, 3 * (1U << shift) / 4); + + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); + } diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c -index 764d176e97351..deb10b89fa51f 100644 +index 764d176e9735..deb10b89fa51 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -945,7 +945,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev) @@ -10266,8 +12708,1332 @@ index 764d176e97351..deb10b89fa51f 100644 gpiod_set_value_cansleep(reset_gpio, 1); return; +diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c +index 9eb1a1859012..a5dbebb1acfc 100644 +--- a/drivers/char/tpm/tpm_crb.c ++++ b/drivers/char/tpm/tpm_crb.c +@@ -463,28 +463,6 @@ static bool crb_req_canceled(struct tpm_chip *chip, u8 status) + return (cancel & CRB_CANCEL_INVOKE) == CRB_CANCEL_INVOKE; + } + +-static int crb_check_flags(struct tpm_chip *chip) +-{ +- u32 val; +- int ret; +- +- ret = crb_request_locality(chip, 0); +- if (ret) +- return ret; +- +- ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val, NULL); +- if (ret) +- goto release; +- +- if (val == 0x414D4400U /* AMD */) +- chip->flags |= TPM_CHIP_FLAG_HWRNG_DISABLED; +- +-release: +- crb_relinquish_locality(chip, 0); +- +- return ret; +-} +- + static const struct tpm_class_ops tpm_crb = { + .flags = TPM_OPS_AUTO_STARTUP, + .status = crb_status, +@@ -826,9 +804,14 @@ static int crb_acpi_add(struct acpi_device *device) + if (rc) + goto out; + +- rc = crb_check_flags(chip); +- if (rc) +- goto out; ++#ifdef CONFIG_X86 ++ /* A quirk for https://www.amd.com/en/support/kb/faq/pa-410 */ ++ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && ++ priv->sm != ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON) { ++ dev_info(dev, "Disabling hwrng\n"); ++ chip->flags |= TPM_CHIP_FLAG_HWRNG_DISABLED; ++ } ++#endif /* CONFIG_X86 */ + + rc = tpm_chip_register(chip); + +diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig +index 2a57328eca20..05e80cfd0ed8 100644 +--- a/drivers/leds/trigger/Kconfig ++++ b/drivers/leds/trigger/Kconfig +@@ -155,4 +155,13 @@ config LEDS_TRIGGER_TTY + + When build as a module this driver will be called ledtrig-tty. + ++config LEDS_TRIGGER_BLKDEV ++ tristate "LED Trigger for block devices" ++ depends on BLOCK ++ help ++ The blkdev LED trigger allows LEDs to be controlled by block device ++ activity (reads and writes). ++ ++ See Documentation/leds/ledtrig-blkdev.rst. 
++ + endif # LEDS_TRIGGERS +diff --git a/drivers/leds/trigger/Makefile b/drivers/leds/trigger/Makefile +index 25c4db97cdd4..d53bab5d93f1 100644 +--- a/drivers/leds/trigger/Makefile ++++ b/drivers/leds/trigger/Makefile +@@ -16,3 +16,4 @@ obj-$(CONFIG_LEDS_TRIGGER_NETDEV) += ledtrig-netdev.o + obj-$(CONFIG_LEDS_TRIGGER_PATTERN) += ledtrig-pattern.o + obj-$(CONFIG_LEDS_TRIGGER_AUDIO) += ledtrig-audio.o + obj-$(CONFIG_LEDS_TRIGGER_TTY) += ledtrig-tty.o ++obj-$(CONFIG_LEDS_TRIGGER_BLKDEV) += ledtrig-blkdev.o +diff --git a/drivers/leds/trigger/ledtrig-blkdev.c b/drivers/leds/trigger/ledtrig-blkdev.c +new file mode 100644 +index 000000000000..9e0c4b66ea27 +--- /dev/null ++++ b/drivers/leds/trigger/ledtrig-blkdev.c +@@ -0,0 +1,1218 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++ ++/* ++ * Block device LED trigger ++ * ++ * Copyright 2021-2023 Ian Pilcher ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++/** ++ * DOC: Overview ++ * ++ * The ``blkdev`` LED trigger works by periodically checking the activity ++ * counters of block devices that have been linked to one or more LEDs and ++ * blinking those LED(s) if the correct type of activity has occurred. The ++ * periodic check is scheduled with the Linux kernel's deferred work facility. ++ * ++ * Trigger-specific data about block devices and LEDs is stored in two data ++ * structures --- &struct blkdev_trig_bdev (a "BTB") and &struct blkdev_trig_led ++ * (a "BTL"). Each structure contains a &struct xarray that holds links to any ++ * linked devices of the other type. I.e. &blkdev_trig_bdev.linked_btls ++ * contains links to all BTLs whose LEDs have been linked to the BTB's block ++ * device, and &blkdev_trig_led.linked_btbs contains links to all BTBs whose ++ * block devices have been linked to the BTL's LED. Thus, a block device can ++ * be linked to more than one LED, and an LED can be linked to more than one ++ * block device. ++ */ ++ ++/* Default, minimum & maximum blink duration (milliseconds) */ ++#define BLKDEV_TRIG_BLINK_DEF 75 ++#define BLKDEV_TRIG_BLINK_MIN 10 ++#define BLKDEV_TRIG_BLINK_MAX 86400000 /* 24 hours */ ++ ++/* Default, minimum & maximum activity check interval (milliseconds) */ ++#define BLKDEV_TRIG_CHECK_DEF 100 ++#define BLKDEV_TRIG_CHECK_MIN 25 ++#define BLKDEV_TRIG_CHECK_MAX 86400000 /* 24 hours */ ++ ++/* ++ * If blkdev_trig_check() can't lock the mutex, how long to wait before trying ++ * again (milliseconds) ++ */ ++#define BLKDEV_TRIG_CHECK_RETRY 5 ++ ++/** ++ * struct blkdev_trig_bdev - Trigger-specific data about a block device. ++ * @last_checked: Time (in jiffies) at which the trigger last checked this ++ * block device for activity. ++ * @last_activity: Time (in jiffies) at which the trigger last detected ++ * activity of each type. ++ * @ios: Activity counter values for each type, corresponding to ++ * the timestamps in &last_activity. ++ * @index: &xarray index, so the BTB can be included in one or more ++ * &blkdev_trig_led.linked_btbs. ++ * @bdev: The block device. ++ * @linked_btls: The BTLs that represent the LEDs linked to the BTB's ++ * block device. ++ * ++ * Every block device linked to at least one LED gets a "BTB." A BTB is created ++ * when a block device that is not currently linked to any LEDs is linked to an ++ * LED. ++ * ++ * A BTB is freed when one of the following occurs: ++ * ++ * * The number of LEDs linked to the block device becomes zero, because it has ++ * been unlinked from its last LED using the trigger's &sysfs interface. 
++ * ++ * * The number of LEDs linked to the block device becomes zero, because the ++ * last LED to which it was linked has been disassociated from the trigger ++ * (which happens automatically if the LED device is removed from the system). ++ * ++ * * The BTB's block device is removed from the system. To accomodate this ++ * scenario, BTB's are created as device resources, so that the release ++ * function will be called by the driver core when the device is removed. ++ */ ++struct blkdev_trig_bdev { ++ unsigned long last_checked; ++ unsigned long last_activity[NR_STAT_GROUPS]; ++ unsigned long ios[NR_STAT_GROUPS]; ++ unsigned long index; ++ struct block_device *bdev; ++ struct xarray linked_btls; ++}; ++ ++/** ++ * struct blkdev_trig_led - Trigger-specific data about an LED. ++ * @last_checked: Time (in jiffies) at which the trigger last checked the ++ * the block devices linked to this LED for activity. ++ * @index: &xarray index, so the BTL can be included in one or more ++ * &blkdev_trig_bdev.linked_btls. ++ * @mode: Bitmask for types of block device activity that will ++ * cause this LED to blink --- reads, writes, discards, ++ * etc. ++ * @led: The LED device. ++ * @blink_msec: Duration of a blink (milliseconds). ++ * @check_jiffies: Frequency with which block devices linked to this LED ++ * should be checked for activity (jiffies). ++ * @linked_btbs: The BTBs that represent the block devices linked to the ++ * BTL's LED. ++ * @all_btls_node: The BTL's node in the module's list of all BTLs. ++ * ++ * Every LED associated with the block device trigger gets a "BTL." A BTL is ++ * created when the trigger is "activated" on an LED (usually by writing ++ * ``blkdev`` to the LED's &sysfs &trigger attribute). A BTL is freed wnen its ++ * LED is disassociated from the trigger, either through the trigger's &sysfs ++ * interface or because the LED device is removed from the system. 
++ */ ++struct blkdev_trig_led { ++ unsigned long last_checked; ++ unsigned long index; ++ unsigned long mode; /* must be ulong for atomic bit ops */ ++ struct led_classdev *led; ++ unsigned int blink_msec; ++ unsigned int check_jiffies; ++ struct xarray linked_btbs; ++ struct hlist_node all_btls_node; ++}; ++ ++/* Protects everything except atomic LED attributes */ ++static DEFINE_MUTEX(blkdev_trig_mutex); ++ ++/* BTB device resource release function */ ++static void blkdev_trig_btb_release(struct device *dev, void *res); ++ ++/* Index for next BTB or BTL */ ++static unsigned long blkdev_trig_next_index; ++ ++/* All LEDs associated with the trigger */ ++static HLIST_HEAD(blkdev_trig_all_btls); ++ ++/* Delayed work to periodically check for activity & blink LEDs */ ++static void blkdev_trig_check(struct work_struct *work); ++static DECLARE_DELAYED_WORK(blkdev_trig_work, blkdev_trig_check); ++ ++/* When is the delayed work scheduled to run next (jiffies) */ ++static unsigned long blkdev_trig_next_check; ++ ++/* Total number of BTB-to-BTL links */ ++static unsigned int blkdev_trig_link_count; ++ ++/* Empty sysfs attribute list for next 2 declarations */ ++static struct attribute *blkdev_trig_attrs_empty[] = { NULL }; ++ ++/* linked_leds sysfs directory for block devs linked to 1 or more LEDs */ ++static const struct attribute_group blkdev_trig_linked_leds = { ++ .name = "linked_leds", ++ .attrs = blkdev_trig_attrs_empty, ++}; ++ ++/* linked_devices sysfs directory for each LED associated with the trigger */ ++static const struct attribute_group blkdev_trig_linked_devs = { ++ .name = "linked_devices", ++ .attrs = blkdev_trig_attrs_empty, ++}; ++ ++ ++/* ++ * ++ * Delayed work to check for activity & blink LEDs ++ * ++ */ ++ ++/** ++ * blkdev_trig_blink() - Blink an LED, if the correct type of activity has ++ * occurred on the block device. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ * Return: &true if the LED is blinked, &false if not. ++ */ ++static bool blkdev_trig_blink(const struct blkdev_trig_led *btl, ++ const struct blkdev_trig_bdev *btb) ++{ ++ unsigned long mode, mask, delay_on, delay_off; ++ enum stat_group i; ++ ++ mode = READ_ONCE(btl->mode); ++ ++ for (i = STAT_READ, mask = 1; i <= STAT_FLUSH; ++i, mask <<= 1) { ++ ++ if (!(mode & mask)) ++ continue; ++ ++ if (time_before_eq(btb->last_activity[i], btl->last_checked)) ++ continue; ++ ++ delay_on = READ_ONCE(btl->blink_msec); ++ delay_off = 1; /* 0 leaves LED turned on */ ++ ++ led_blink_set_oneshot(btl->led, &delay_on, &delay_off, 0); ++ return true; ++ } ++ ++ return false; ++} ++ ++/** ++ * blkdev_trig_update_btb() - Update a BTB's activity counters and timestamps. ++ * @btb: The BTB ++ * @now: Timestamp (in jiffies) ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_update_btb(struct blkdev_trig_bdev *btb, ++ unsigned long now) ++{ ++ unsigned long new_ios; ++ enum stat_group i; ++ ++ for (i = STAT_READ; i <= STAT_FLUSH; ++i) { ++ ++ new_ios = part_stat_read(btb->bdev, ios[i]); ++ ++ if (new_ios != btb->ios[i]) { ++ btb->ios[i] = new_ios; ++ btb->last_activity[i] = now; ++ } ++ } ++ ++ btb->last_checked = now; ++} ++ ++/** ++ * blkdev_trig_check() - Check linked devices for activity and blink LEDs. ++ * @work: Delayed work (&blkdev_trig_work) ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. 
++ */ ++static void blkdev_trig_check(struct work_struct *work) ++{ ++ struct blkdev_trig_led *btl; ++ struct blkdev_trig_bdev *btb; ++ unsigned long index, delay, now, led_check, led_delay; ++ bool blinked; ++ ++ if (!mutex_trylock(&blkdev_trig_mutex)) { ++ delay = msecs_to_jiffies(BLKDEV_TRIG_CHECK_RETRY); ++ goto exit_reschedule; ++ } ++ ++ now = jiffies; ++ delay = ULONG_MAX; ++ ++ hlist_for_each_entry (btl, &blkdev_trig_all_btls, all_btls_node) { ++ ++ led_check = btl->last_checked + btl->check_jiffies; ++ ++ if (time_before_eq(led_check, now)) { ++ ++ blinked = false; ++ ++ xa_for_each (&btl->linked_btbs, index, btb) { ++ ++ if (btb->last_checked != now) ++ blkdev_trig_update_btb(btb, now); ++ if (!blinked) ++ blinked = blkdev_trig_blink(btl, btb); ++ } ++ ++ btl->last_checked = now; ++ led_delay = btl->check_jiffies; ++ ++ } else { ++ led_delay = led_check - now; ++ } ++ ++ if (led_delay < delay) ++ delay = led_delay; ++ } ++ ++ mutex_unlock(&blkdev_trig_mutex); ++ ++exit_reschedule: ++ WARN_ON_ONCE(delay == ULONG_MAX); ++ WARN_ON_ONCE(!schedule_delayed_work(&blkdev_trig_work, delay)); ++} ++ ++/** ++ * blkdev_trig_sched_led() - Set the schedule of the delayed work when a new ++ * LED is added to the schedule. ++ * @btl: The BTL that represents the LED ++ * ++ * Called when the number of block devices to which an LED is linked becomes ++ * non-zero. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_sched_led(const struct blkdev_trig_led *btl) ++{ ++ unsigned long delay = READ_ONCE(btl->check_jiffies); ++ unsigned long check_by = jiffies + delay; ++ ++ /* ++ * If no other LED-to-block device links exist, simply schedule the ++ * delayed work according to this LED's check_interval attribute ++ * (check_jiffies). ++ */ ++ if (blkdev_trig_link_count == 0) { ++ WARN_ON(!schedule_delayed_work(&blkdev_trig_work, delay)); ++ blkdev_trig_next_check = check_by; ++ return; ++ } ++ ++ /* ++ * If the next check is already scheduled to occur soon enough to ++ * accomodate this LED's check_interval, the schedule doesn't need ++ * to be changed. ++ */ ++ if (time_after_eq(check_by, blkdev_trig_next_check)) ++ return; ++ ++ /* ++ * Modify the schedule, so that the delayed work runs soon enough for ++ * this LED. ++ */ ++ WARN_ON(!mod_delayed_work(system_wq, &blkdev_trig_work, delay)); ++ blkdev_trig_next_check = check_by; ++} ++ ++ ++/* ++ * ++ * Linking and unlinking LEDs and block devices ++ * ++ */ ++ ++/** ++ * blkdev_trig_link() - Link a block device to an LED. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ * Return: &0 on success, negative &errno on error. 
++ */ ++static int blkdev_trig_link(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ bool led_first_link; ++ int err; ++ ++ led_first_link = xa_empty(&btl->linked_btbs); ++ ++ err = xa_insert(&btb->linked_btls, btl->index, btl, GFP_KERNEL); ++ if (err) ++ return err; ++ ++ err = xa_insert(&btl->linked_btbs, btb->index, btb, GFP_KERNEL); ++ if (err) ++ goto error_erase_btl; ++ ++ /* Create /sys/class/block//linked_leds/ symlink */ ++ err = sysfs_add_link_to_group(bdev_kobj(btb->bdev), ++ blkdev_trig_linked_leds.name, ++ &btl->led->dev->kobj, btl->led->name); ++ if (err) ++ goto error_erase_btb; ++ ++ /* Create /sys/class/leds//linked_devices/ symlink */ ++ err = sysfs_add_link_to_group(&btl->led->dev->kobj, ++ blkdev_trig_linked_devs.name, ++ bdev_kobj(btb->bdev), ++ dev_name(&btb->bdev->bd_device)); ++ if (err) ++ goto error_remove_symlink; ++ ++ /* ++ * If this is the first block device linked to this LED, the delayed ++ * work schedule may need to be changed. ++ */ ++ if (led_first_link) ++ blkdev_trig_sched_led(btl); ++ ++ ++blkdev_trig_link_count; ++ ++ return 0; ++ ++error_remove_symlink: ++ sysfs_remove_link_from_group(bdev_kobj(btb->bdev), ++ blkdev_trig_linked_leds.name, ++ btl->led->name); ++error_erase_btb: ++ xa_erase(&btl->linked_btbs, btb->index); ++error_erase_btl: ++ xa_erase(&btb->linked_btls, btl->index); ++ return err; ++} ++ ++/** ++ * blkdev_trig_put_btb() - Remove and free a BTB, if it is no longer needed. ++ * @btb: The BTB ++ * ++ * Does nothing if the BTB (block device) is still linked to at least one LED. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_put_btb(struct blkdev_trig_bdev *btb) ++{ ++ struct block_device *bdev = btb->bdev; ++ int err; ++ ++ if (xa_empty(&btb->linked_btls)) { ++ ++ sysfs_remove_group(bdev_kobj(bdev), &blkdev_trig_linked_leds); ++ err = devres_destroy(&bdev->bd_device, blkdev_trig_btb_release, ++ NULL, NULL); ++ WARN_ON(err); ++ } ++} ++ ++/** ++ * _blkdev_trig_unlink_always() - Perform the unconditionally required steps of ++ * unlinking a block device from an LED. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * When a block device is unlinked from an LED, certain steps must be performed ++ * only if the block device is **not** being released. This function performs ++ * those steps that are **always** required, whether or not the block device is ++ * being released. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void _blkdev_trig_unlink_always(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ --blkdev_trig_link_count; ++ ++ if (blkdev_trig_link_count == 0) ++ WARN_ON(!cancel_delayed_work_sync(&blkdev_trig_work)); ++ ++ xa_erase(&btb->linked_btls, btl->index); ++ xa_erase(&btl->linked_btbs, btb->index); ++ ++ /* Remove /sys/class/leds//linked_devices/ symlink */ ++ sysfs_remove_link_from_group(&btl->led->dev->kobj, ++ blkdev_trig_linked_devs.name, ++ dev_name(&btb->bdev->bd_device)); ++} ++ ++/** ++ * blkdev_trig_unlink_norelease() - Unlink an LED from a block device that is ++ * **not** being released. ++ * @btl: The BTL that represents the LED. ++ * @btb: The BTB that represents the block device. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. 
++ */ ++static void blkdev_trig_unlink_norelease(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ _blkdev_trig_unlink_always(btl, btb); ++ ++ /* Remove /sys/class/block//linked_leds/ symlink */ ++ sysfs_remove_link_from_group(bdev_kobj(btb->bdev), ++ blkdev_trig_linked_leds.name, ++ btl->led->name); ++ ++ blkdev_trig_put_btb(btb); ++} ++ ++/** ++ * blkdev_trig_unlink_release() - Unlink an LED from a block device that is ++ * being released. ++ * @btl: The BTL that represents the LED ++ * @btb: The BTB that represents the block device ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_unlink_release(struct blkdev_trig_led *btl, ++ struct blkdev_trig_bdev *btb) ++{ ++ _blkdev_trig_unlink_always(btl, btb); ++ ++ /* ++ * If the BTB is being released, the driver core has already removed the ++ * device's attribute groups, and the BTB will be freed automatically, ++ * so there's nothing else to do. ++ */ ++} ++ ++ ++/* ++ * ++ * BTB creation ++ * ++ */ ++ ++/** ++ * blkdev_trig_btb_release() - BTB device resource release function. ++ * @dev: The block device ++ * @res: The BTB ++ * ++ * Called by the driver core when a block device with a BTB is removed. ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_btb_release(struct device *dev, void *res) ++{ ++ struct blkdev_trig_bdev *btb = res; ++ struct blkdev_trig_led *btl; ++ unsigned long index; ++ ++ mutex_lock(&blkdev_trig_mutex); ++ ++ xa_for_each (&btb->linked_btls, index, btl) ++ blkdev_trig_unlink_release(btl, btb); ++ ++ mutex_unlock(&blkdev_trig_mutex); ++} ++ ++/** ++ * blkdev_trig_get_bdev() - Get a block device by path. ++ * @path: The value written to an LED's &link_dev_by_path or ++ * &unlink_dev_by_path attribute, which should be the path to a ++ * special file that represents a block device ++ * @len: The number of characters in &path (not including its ++ * terminating null) ++ * ++ * The caller must call blkdev_put() when finished with the device. ++ * ++ * Context: Process context. ++ * Return: The block device, or an error pointer. ++ */ ++static struct block_device *blkdev_trig_get_bdev(const char *path, size_t len) ++{ ++ struct block_device *bdev; ++ char *buf; ++ ++ buf = kmemdup(path, len + 1, GFP_KERNEL); /* +1 to include null */ ++ if (buf == NULL) ++ return ERR_PTR(-ENOMEM); ++ ++ bdev = blkdev_get_by_path(strim(buf), 0, NULL, NULL); ++ kfree(buf); ++ return bdev; ++} ++ ++/** ++ * blkdev_trig_get_btb() - Find or create the BTB for a block device. ++ * @path: The value written to an LED's &link_dev_by_path attribute, ++ * which should be the path to a special file that represents a ++ * block device ++ * @len: The number of characters in &path ++ * ++ * If a new BTB is created, because the block device was not previously linked ++ * to any LEDs, the block device's &linked_leds &sysfs directory is created. ++ * ++ * Context: Process context. Caller must hold &blkdev_trig_mutex. ++ * Return: Pointer to the BTB, error pointer on error. 
++ */ ++static struct blkdev_trig_bdev *blkdev_trig_get_btb(const char *path, ++ size_t len) ++{ ++ struct block_device *bdev; ++ struct blkdev_trig_bdev *btb; ++ int err; ++ ++ bdev = blkdev_trig_get_bdev(path, len); ++ if (IS_ERR(bdev)) ++ return ERR_CAST(bdev); ++ ++ btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release, ++ NULL, NULL); ++ if (btb != NULL) { ++ err = 0; ++ goto exit_put_bdev; ++ } ++ ++ if (blkdev_trig_next_index == ULONG_MAX) { ++ err = -EOVERFLOW; ++ goto exit_put_bdev; ++ } ++ ++ btb = devres_alloc(blkdev_trig_btb_release, sizeof(*btb), GFP_KERNEL); ++ if (btb == NULL) { ++ err = -ENOMEM; ++ goto exit_put_bdev; ++ } ++ ++ err = sysfs_create_group(bdev_kobj(bdev), &blkdev_trig_linked_leds); ++ if (err) ++ goto exit_free_btb; ++ ++ btb->index = blkdev_trig_next_index++; ++ btb->bdev = bdev; ++ xa_init(&btb->linked_btls); ++ ++ /* Populate BTB activity counters */ ++ blkdev_trig_update_btb(btb, jiffies); ++ ++ devres_add(&bdev->bd_device, btb); ++ ++exit_free_btb: ++ if (err) ++ devres_free(btb); ++exit_put_bdev: ++ blkdev_put(bdev, NULL); ++ return err ? ERR_PTR(err) : btb; ++} ++ ++ ++/* ++ * ++ * Activating and deactivating the trigger on an LED ++ * ++ */ ++ ++/** ++ * blkdev_trig_activate() - Called by the LEDs subsystem when an LED is ++ * associated with the trigger. ++ * @led: The LED ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &0 on success, negative &errno on error. ++ */ ++static int blkdev_trig_activate(struct led_classdev *led) ++{ ++ struct blkdev_trig_led *btl; ++ int err; ++ ++ btl = kzalloc(sizeof(*btl), GFP_KERNEL); ++ if (btl == NULL) ++ return -ENOMEM; ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ goto exit_free; ++ ++ if (blkdev_trig_next_index == ULONG_MAX) { ++ err = -EOVERFLOW; ++ goto exit_unlock; ++ } ++ ++ btl->index = blkdev_trig_next_index++; ++ btl->last_checked = jiffies; ++ btl->mode = -1; /* set all bits */ ++ btl->led = led; ++ btl->blink_msec = BLKDEV_TRIG_BLINK_DEF; ++ btl->check_jiffies = msecs_to_jiffies(BLKDEV_TRIG_CHECK_DEF); ++ xa_init(&btl->linked_btbs); ++ ++ hlist_add_head(&btl->all_btls_node, &blkdev_trig_all_btls); ++ led_set_trigger_data(led, btl); ++ ++exit_unlock: ++ mutex_unlock(&blkdev_trig_mutex); ++exit_free: ++ if (err) ++ kfree(btl); ++ return err; ++} ++ ++/** ++ * blkdev_trig_deactivate() - Called by the the LEDs subsystem when an LED is ++ * disassociated from the trigger. ++ * @led: The LED ++ * ++ * The LEDs subsystem also calls this function when an LED associated with the ++ * trigger is removed or when the trigger is unregistered (if the module is ++ * unloaded). ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ */ ++static void blkdev_trig_deactivate(struct led_classdev *led) ++{ ++ struct blkdev_trig_led *btl = led_get_trigger_data(led); ++ struct blkdev_trig_bdev *btb; ++ unsigned long index; ++ ++ mutex_lock(&blkdev_trig_mutex); ++ ++ xa_for_each (&btl->linked_btbs, index, btb) ++ blkdev_trig_unlink_norelease(btl, btb); ++ ++ hlist_del(&btl->all_btls_node); ++ kfree(btl); ++ ++ mutex_unlock(&blkdev_trig_mutex); ++} ++ ++ ++/* ++ * ++ * Link-related attribute store functions ++ * ++ */ ++ ++/** ++ * link_dev_by_path_store() - &link_dev_by_path device attribute store function. 
++ * @dev: The LED device ++ * @attr: The &link_dev_by_path attribute (&dev_attr_link_dev_by_path) ++ * @buf: The value written to the attribute, which should be the path to ++ * a special file that represents a block device to be linked to ++ * the LED (e.g. ``/dev/sda``) ++ * @count: The number of characters in &buf ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t link_dev_by_path_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ struct blkdev_trig_bdev *btb; ++ int err; ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ return err; ++ ++ btb = blkdev_trig_get_btb(buf, count); ++ if (IS_ERR(btb)) { ++ err = PTR_ERR(btb); ++ goto exit_unlock; ++ } ++ ++ if (xa_load(&btb->linked_btls, btl->index) != NULL) { ++ err = -EEXIST; ++ goto exit_put_btb; ++ } ++ ++ err = blkdev_trig_link(btl, btb); ++ ++exit_put_btb: ++ if (err) ++ blkdev_trig_put_btb(btb); ++exit_unlock: ++ mutex_unlock(&blkdev_trig_mutex); ++ return err ? : count; ++} ++ ++/** ++ * unlink_dev_by_path_store() - &unlink_dev_by_path device attribute store ++ * function. ++ * @dev: The LED device ++ * @attr: The &unlink_dev_by_path attribute (&dev_attr_unlink_dev_by_path) ++ * @buf: The value written to the attribute, which should be the path to ++ * a special file that represents a block device to be unlinked ++ * from the LED (e.g. ``/dev/sda``) ++ * @count: The number of characters in &buf ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t unlink_dev_by_path_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ struct block_device *bdev; ++ struct blkdev_trig_bdev *btb; ++ int err; ++ ++ bdev = blkdev_trig_get_bdev(buf, count); ++ if (IS_ERR(bdev)) ++ return PTR_ERR(bdev); ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ goto exit_put_bdev; ++ ++ btb = devres_find(&bdev->bd_device, blkdev_trig_btb_release, ++ NULL, NULL); ++ if (btb == NULL) { ++ err = -EUNATCH; /* bdev isn't linked to any LED */ ++ goto exit_unlock; ++ } ++ ++ if (xa_load(&btb->linked_btls, btl->index) == NULL) { ++ err = -EUNATCH; /* bdev isn't linked to this LED */ ++ goto exit_unlock; ++ } ++ ++ blkdev_trig_unlink_norelease(btl, btb); ++ ++exit_unlock: ++ mutex_unlock(&blkdev_trig_mutex); ++exit_put_bdev: ++ blkdev_put(bdev, NULL); ++ return err ? : count; ++} ++ ++/** ++ * unlink_dev_by_name_store() - &unlink_dev_by_name device attribute store ++ * function. ++ * @dev: The LED device ++ * @attr: The &unlink_dev_by_name attribute (&dev_attr_unlink_dev_by_name) ++ * @buf: The value written to the attribute, which should be the kernel ++ * name of a block device to be unlinked from the LED (e.g. ++ * ``sda``) ++ * @count: The number of characters in &buf ++ * ++ * Context: Process context. Takes and releases &blkdev_trig_mutex. ++ * Return: &count on success, negative &errno on error. 
++ */ ++static ssize_t unlink_dev_by_name_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ struct blkdev_trig_bdev *btb; ++ unsigned long index; ++ int err; ++ ++ err = mutex_lock_interruptible(&blkdev_trig_mutex); ++ if (err) ++ return err; ++ ++ err = -EUNATCH; ++ ++ xa_for_each (&btl->linked_btbs, index, btb) { ++ ++ if (sysfs_streq(dev_name(&btb->bdev->bd_device), buf)) { ++ blkdev_trig_unlink_norelease(btl, btb); ++ err = 0; ++ break; ++ } ++ } ++ ++ mutex_unlock(&blkdev_trig_mutex); ++ return err ? : count; ++} ++ ++ ++/* ++ * ++ * Atomic attribute show & store functions ++ * ++ */ ++ ++/** ++ * blink_time_show() - &blink_time device attribute show function. ++ * @dev: The LED device ++ * @attr: The &blink_time attribute (&dev_attr_blink_time) ++ * @buf: Output buffer ++ * ++ * Writes the value of &blkdev_trig_led.blink_msec to &buf. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t blink_time_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ const struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ ++ return sysfs_emit(buf, "%u\n", READ_ONCE(btl->blink_msec)); ++} ++ ++/** ++ * blink_time_store() - &blink_time device attribute store function. ++ * @dev: The LED device ++ * @attr: The &blink_time attribute (&dev_attr_blink_time) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets &blkdev_trig_led.blink_msec to the value in &buf. ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t blink_time_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ unsigned int value; ++ int err; ++ ++ err = kstrtouint(buf, 0, &value); ++ if (err) ++ return err; ++ ++ if (value < BLKDEV_TRIG_BLINK_MIN || value > BLKDEV_TRIG_BLINK_MAX) ++ return -ERANGE; ++ ++ WRITE_ONCE(btl->blink_msec, value); ++ return count; ++} ++ ++/** ++ * check_interval_show() - &check_interval device attribute show function. ++ * @dev: The LED device ++ * @attr: The &check_interval attribute (&dev_attr_check_interval) ++ * @buf: Output buffer ++ * ++ * Writes the value of &blkdev_trig_led.check_jiffies (converted to ++ * milliseconds) to &buf. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t check_interval_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); ++ ++ return sysfs_emit(buf, "%u\n", ++ jiffies_to_msecs(READ_ONCE(btl->check_jiffies))); ++} ++ ++/** ++ * check_interval_store() - &check_interval device attribute store function ++ * @dev: The LED device ++ * @attr: The &check_interval attribute (&dev_attr_check_interval) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets &blkdev_trig_led.check_jiffies to the value in &buf (after converting ++ * from milliseconds). ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. 
++ */ ++static ssize_t check_interval_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct blkdev_trig_led *led = led_trigger_get_drvdata(dev); ++ unsigned int value; ++ int err; ++ ++ err = kstrtouint(buf, 0, &value); ++ if (err) ++ return err; ++ ++ if (value < BLKDEV_TRIG_CHECK_MIN || value > BLKDEV_TRIG_CHECK_MAX) ++ return -ERANGE; ++ ++ WRITE_ONCE(led->check_jiffies, msecs_to_jiffies(value)); ++ ++ return count; ++} ++ ++/** ++ * blkdev_trig_mode_show() - Helper for boolean attribute show functions. ++ * @led: The LED ++ * @buf: Output buffer ++ * @bit: Which bit to show ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static int blkdev_trig_mode_show(const struct blkdev_trig_led *led, char *buf, ++ enum stat_group bit) ++{ ++ return sysfs_emit(buf, ++ READ_ONCE(led->mode) & (1 << bit) ? "Y\n" : "N\n"); ++} ++ ++/** ++ * blkdev_trig_mode_store() - Helper for boolean attribute store functions. ++ * @led: The LED ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * @bit: Which bit to set ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. ++ */ ++static int blkdev_trig_mode_store(struct blkdev_trig_led *led, ++ const char *buf, size_t count, ++ enum stat_group bit) ++{ ++ bool set; ++ int err; ++ ++ err = kstrtobool(buf, &set); ++ if (err) ++ return err; ++ ++ if (set) ++ set_bit(bit, &led->mode); ++ else ++ clear_bit(bit, &led->mode); ++ ++ return count; ++} ++ ++/** ++ * blink_on_read_show() - &blink_on_read device attribute show function. ++ * @dev: The LED device ++ * @attr: The &blink_on_read attribute (&dev_attr_blink_on_read) ++ * @buf: Output buffer ++ * ++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_READ bit in ++ * &blkdev_trig_led.mode is set or cleared. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t blink_on_read_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), ++ buf, STAT_READ); ++} ++ ++/** ++ * blink_on_read_store() - &blink_on_read device attribute store function. ++ * @dev: The LED device ++ * @attr: The &blink_on_read attribute (&dev_attr_blink_on_read) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets the &STAT_READ bit in &blkdev_trig_led.mode to the value in &buf ++ * (interpretted as a boolean). ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t blink_on_read_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), ++ buf, count, STAT_READ); ++} ++ ++/** ++ * blink_on_write_show() - &blink_on_write device attribute show function. ++ * @dev: The LED device ++ * @attr: The &blink_on_write attribute (&dev_attr_blink_on_write) ++ * @buf: Output buffer ++ * ++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_WRITE bit in ++ * in &blkdev_trig_led.mode is set or cleared. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. 
++ */ ++static ssize_t blink_on_write_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), ++ buf, STAT_WRITE); ++} ++ ++/** ++ * blink_on_write_store() - &blink_on_write device attribute store function. ++ * @dev: The LED device ++ * @attr: The &blink_on_write attribute (&dev_attr_blink_on_write) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets the &STAT_WRITE bit in &blkdev_trig_led.mode to the value in &buf ++ * (interpretted as a boolean). ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t blink_on_write_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), ++ buf, count, STAT_WRITE); ++} ++ ++/** ++ * blink_on_flush_show() - &blink_on_flush device attribute show function. ++ * @dev: The LED device ++ * @attr: The &blink_on_flush attribute (&dev_attr_blink_on_flush) ++ * @buf: Output buffer ++ * ++ * Writes ``Y`` or ``N`` to &buf, depending whether the &STAT_FLUSH bit in ++ * &blkdev_trig_led.mode is set or cleared. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t blink_on_flush_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), ++ buf, STAT_FLUSH); ++} ++ ++/** ++ * blink_on_flush_store() - &blink_on_flush device attribute store function. ++ * @dev: The LED device ++ * @attr: The &blink_on_flush attribute (&dev_attr_blink_on_flush) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets the &STAT_FLUSH bit in &blkdev_trig_led.mode to the value in &buf ++ * (interpretted as a boolean). ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. ++ */ ++static ssize_t blink_on_flush_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), ++ buf, count, STAT_FLUSH); ++} ++ ++/** ++ * blink_on_discard_show() - &blink_on_discard device attribute show function. ++ * @dev: The LED device ++ * @attr: The &blink_on_discard attribute (&dev_attr_blink_on_discard) ++ * @buf: Output buffer ++ * ++ * Writes ``Y`` or ``N`` to &buf, depending on whether the &STAT_DISCARD bit in ++ * &blkdev_trig_led.mode is set or cleared. ++ * ++ * Context: Process context. ++ * Return: The number of characters written to &buf. ++ */ ++static ssize_t blink_on_discard_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return blkdev_trig_mode_show(led_trigger_get_drvdata(dev), ++ buf, STAT_DISCARD); ++} ++ ++/** ++ * blink_on_discard_store() - &blink_on_discard device attribute store function. ++ * @dev: The LED device ++ * @attr: The &blink_on_discard attribute (&dev_attr_blink_on_discard) ++ * @buf: The new value (as written to the &sysfs attribute) ++ * @count: The number of characters in &buf ++ * ++ * Sets the &STAT_DISCARD bit in &blkdev_trig_led.mode to the value in &buf ++ * (interpretted as a boolean). ++ * ++ * Context: Process context. ++ * Return: &count on success, negative &errno on error. 
++ */ ++static ssize_t blink_on_discard_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return blkdev_trig_mode_store(led_trigger_get_drvdata(dev), ++ buf, count, STAT_DISCARD); ++} ++ ++/* Device attributes */ ++static DEVICE_ATTR_WO(link_dev_by_path); ++static DEVICE_ATTR_WO(unlink_dev_by_path); ++static DEVICE_ATTR_WO(unlink_dev_by_name); ++static DEVICE_ATTR_RW(blink_time); ++static DEVICE_ATTR_RW(check_interval); ++static DEVICE_ATTR_RW(blink_on_read); ++static DEVICE_ATTR_RW(blink_on_write); ++static DEVICE_ATTR_RW(blink_on_flush); ++static DEVICE_ATTR_RW(blink_on_discard); ++ ++/* Device attributes in LED directory (/sys/class/leds//...) */ ++static struct attribute *blkdev_trig_attrs[] = { ++ &dev_attr_link_dev_by_path.attr, ++ &dev_attr_unlink_dev_by_path.attr, ++ &dev_attr_unlink_dev_by_name.attr, ++ &dev_attr_blink_time.attr, ++ &dev_attr_check_interval.attr, ++ &dev_attr_blink_on_read.attr, ++ &dev_attr_blink_on_write.attr, ++ &dev_attr_blink_on_flush.attr, ++ &dev_attr_blink_on_discard.attr, ++ NULL ++}; ++ ++/* Unnamed attribute group == no subdirectory */ ++static const struct attribute_group blkdev_trig_attr_group = { ++ .attrs = blkdev_trig_attrs, ++}; ++ ++/* Attribute groups for the trigger */ ++static const struct attribute_group *blkdev_trig_attr_groups[] = { ++ &blkdev_trig_attr_group, /* /sys/class/leds//... */ ++ &blkdev_trig_linked_devs, /* /sys/class/leds//linked_devices/ */ ++ NULL ++}; ++ ++/* Trigger registration data */ ++static struct led_trigger blkdev_trig_trigger = { ++ .name = "blkdev", ++ .activate = blkdev_trig_activate, ++ .deactivate = blkdev_trig_deactivate, ++ .groups = blkdev_trig_attr_groups, ++}; ++ ++/** ++ * blkdev_trig_init() - Block device LED trigger initialization. ++ * ++ * Registers the ``blkdev`` LED trigger. ++ * ++ * Return: &0 on success, negative &errno on failure. ++ */ ++static int __init blkdev_trig_init(void) ++{ ++ return led_trigger_register(&blkdev_trig_trigger); ++} ++module_init(blkdev_trig_init); ++ ++/** ++ * blkdev_trig_exit() - Block device LED trigger module exit. ++ * ++ * Unregisters the ``blkdev`` LED trigger. 
++ */ ++static void __exit blkdev_trig_exit(void) ++{ ++ led_trigger_unregister(&blkdev_trig_trigger); ++} ++module_exit(blkdev_trig_exit); ++ ++MODULE_DESCRIPTION("Block device LED trigger"); ++MODULE_AUTHOR("Ian Pilcher "); ++MODULE_LICENSE("GPL v2"); +diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c +index 4dff656af3ad..74241b2ff21e 100644 +--- a/drivers/pinctrl/pinctrl-amd.c ++++ b/drivers/pinctrl/pinctrl-amd.c +@@ -748,7 +748,7 @@ static int amd_pinconf_get(struct pinctrl_dev *pctldev, + break; + + default: +- dev_err(&gpio_dev->pdev->dev, "Invalid config param %04x\n", ++ dev_dbg(&gpio_dev->pdev->dev, "Invalid config param %04x\n", + param); + return -ENOTSUPP; + } +@@ -798,7 +798,7 @@ static int amd_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, + break; + + default: +- dev_err(&gpio_dev->pdev->dev, ++ dev_dbg(&gpio_dev->pdev->dev, + "Invalid config param %04x\n", param); + ret = -ENOTSUPP; + } diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h -index e83c4c0950417..21b8dfa5d8286 100644 +index e83c4c095041..21b8dfa5d828 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -48,7 +48,7 @@ extern unsigned int pageblock_order; @@ -10280,7 +14046,7 @@ index e83c4c0950417..21b8dfa5d8286 100644 #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/kernel/padata.c b/kernel/padata.c -index 222d60195de66..b8e6b7c48746e 100644 +index 222d60195de6..b8e6b7c48746 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -45,7 +45,7 @@ struct padata_mt_job_state { @@ -10302,7 +14068,7 @@ index 222d60195de66..b8e6b7c48746e 100644 struct padata_work *pw = container_of(w, struct padata_work, pw_work); struct padata_mt_job_state *ps = pw->pw_data; diff --git a/mm/readahead.c b/mm/readahead.c -index a9c999aa19af6..797494cec4903 100644 +index a9c999aa19af..797494cec490 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -613,9 +613,17 @@ static void ondemand_readahead(struct readahead_control *ractl, @@ -10324,8 +14090,21 @@ index a9c999aa19af6..797494cec4903 100644 ra->start = start; ra->size = start - index; /* old async_size */ ra->size += req_size; +diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o +index 0edfdb40364b..ae52d3b3f063 100644 +--- a/scripts/Makefile.vmlinux_o ++++ b/scripts/Makefile.vmlinux_o +@@ -19,7 +19,7 @@ quiet_cmd_gen_initcalls_lds = GEN $@ + + .tmp_initcalls.lds: $(srctree)/scripts/generate_initcall_order.pl \ + vmlinux.a $(KBUILD_VMLINUX_LIBS) FORCE +- $(call if_changed,gen_initcalls_lds) ++ +$(call if_changed,gen_initcalls_lds) + + targets := .tmp_initcalls.lds + diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c -index ce5faa6205170..1f0f2b8df3005 100644 +index ce5faa620517..1f0f2b8df300 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -1235,7 +1235,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd @@ -10338,12 +14117,12 @@ index ce5faa6205170..1f0f2b8df3005 100644 hw_cfg->gpio1.func = CS35l41_VSPK_SWITCH; hw_cfg->gpio1.valid = true; -- -2.41.0 +2.42.0 -From 4b328fcd2f946e4a517cd7f562482a5f0c9bbe04 Mon Sep 17 00:00:00 2001 +From e4895406f7f12e8bed1293c24931803abb1915c1 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 10 Jul 2023 17:10:25 +0200 -Subject: [PATCH 5/6] ksm +Subject: [PATCH 6/7] ksm Signed-off-by: Peter Jung --- @@ -10380,7 +14159,7 @@ Signed-off-by: Peter Jung 30 files changed, 390 insertions(+), 18 deletions(-) diff --git a/Documentation/admin-guide/mm/ksm.rst 
b/Documentation/admin-guide/mm/ksm.rst -index 7626392fe82cb..5c5be7bd84b81 100644 +index 7626392fe82c..5c5be7bd84b8 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -173,6 +173,13 @@ stable_node_chains @@ -10431,7 +14210,7 @@ index 7626392fe82cb..5c5be7bd84b81 100644 From the perspective of application, a high ratio of ``ksm_rmap_items`` to ``ksm_merging_pages`` means a bad madvise-applied policy, so developers or diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl -index 1f13995d00d7b..4a5bc2a91fa74 100644 +index 1f13995d00d7..4a5bc2a91fa7 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -491,3 +491,6 @@ @@ -10442,7 +14221,7 @@ index 1f13995d00d7b..4a5bc2a91fa74 100644 +563 common process_ksm_disable sys_process_ksm_disable +564 common process_ksm_status sys_process_ksm_status diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl -index 8ebed8a138747..d616dcc060df3 100644 +index 8ebed8a13874..d616dcc060df 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -465,3 +465,6 @@ @@ -10453,7 +14232,7 @@ index 8ebed8a138747..d616dcc060df3 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h -index 64a514f90131b..63a8a9c4abc16 100644 +index 64a514f90131..63a8a9c4abc1 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -39,7 +39,7 @@ @@ -10466,7 +14245,7 @@ index 64a514f90131b..63a8a9c4abc16 100644 #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h -index d952a28463e01..c99c8260489b8 100644 +index d952a28463e0..c99c8260489b 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -909,6 +909,12 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv) @@ -10483,7 +14262,7 @@ index d952a28463e01..c99c8260489b8 100644 /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl -index f8c74ffeeefbe..735157909c6fb 100644 +index f8c74ffeeefb..735157909c6f 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -372,3 +372,6 @@ @@ -10494,7 +14273,7 @@ index f8c74ffeeefbe..735157909c6fb 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl -index 4f504783371fc..25b22d311f108 100644 +index 4f504783371f..25b22d311f10 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -451,3 +451,6 @@ @@ -10505,7 +14284,7 @@ index 4f504783371fc..25b22d311f108 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl -index 858d22bf275c2..e548c182a33ef 100644 +index 858d22bf275c..e548c182a33e 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -457,3 +457,6 @@ @@ -10516,7 +14295,7 @@ index 858d22bf275c2..e548c182a33ef 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git 
a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl -index 1976317d4e8b0..fed21167be444 100644 +index 1976317d4e8b..fed21167be44 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -390,3 +390,6 @@ @@ -10527,7 +14306,7 @@ index 1976317d4e8b0..fed21167be444 100644 +453 n32 process_ksm_disable sys_process_ksm_disable +454 n32 process_ksm_status sys_process_ksm_status diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl -index cfda2511badf3..b27ae871f676f 100644 +index cfda2511badf..b27ae871f676 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -366,3 +366,6 @@ @@ -10538,7 +14317,7 @@ index cfda2511badf3..b27ae871f676f 100644 +453 n64 process_ksm_disable sys_process_ksm_disable +454 n64 process_ksm_status sys_process_ksm_status diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl -index 7692234c37683..59f298413c292 100644 +index 7692234c3768..59f298413c29 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -439,3 +439,6 @@ @@ -10549,7 +14328,7 @@ index 7692234c37683..59f298413c292 100644 +453 o32 process_ksm_disable sys_process_ksm_disable +454 o32 process_ksm_status sys_process_ksm_status diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl -index a0a9145b6dd4f..494b59d1185fa 100644 +index a0a9145b6dd4..494b59d1185f 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -450,3 +450,6 @@ @@ -10560,7 +14339,7 @@ index a0a9145b6dd4f..494b59d1185fa 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl -index 8c0b08b7a80ec..499d7b233a431 100644 +index 8c0b08b7a80e..499d7b233a43 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -538,3 +538,6 @@ @@ -10571,7 +14350,7 @@ index 8c0b08b7a80ec..499d7b233a431 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl -index a6935af2235ca..97b36ce151556 100644 +index a6935af2235c..97b36ce15155 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -454,3 +454,6 @@ @@ -10582,7 +14361,7 @@ index a6935af2235ca..97b36ce151556 100644 +453 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status sys_process_ksm_status diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl -index 97377e8c50251..bd3827e1fc8d9 100644 +index 97377e8c5025..bd3827e1fc8d 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -454,3 +454,6 @@ @@ -10593,7 +14372,7 @@ index 97377e8c50251..bd3827e1fc8d9 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl -index faa835f3c54a5..c05e62a0ca026 100644 +index faa835f3c54a..c05e62a0ca02 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ 
b/arch/sparc/kernel/syscalls/syscall.tbl @@ -497,3 +497,6 @@ @@ -10604,7 +14383,7 @@ index faa835f3c54a5..c05e62a0ca026 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl -index bc0a3c941b35c..c79bd2dd758da 100644 +index bc0a3c941b35..c79bd2dd758d 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -456,3 +456,6 @@ @@ -10615,7 +14394,7 @@ index bc0a3c941b35c..c79bd2dd758da 100644 +453 i386 process_ksm_disable sys_process_ksm_disable +454 i386 process_ksm_status sys_process_ksm_status diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl -index 227538b0ce801..e146a70cc299f 100644 +index 227538b0ce80..e146a70cc299 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -373,6 +373,9 @@ @@ -10629,7 +14408,7 @@ index 227538b0ce801..e146a70cc299f 100644 # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl -index 2b69c3c035b6a..b7bf81a3ba133 100644 +index 2b69c3c035b6..b7bf81a3ba13 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -422,3 +422,6 @@ @@ -10640,7 +14419,7 @@ index 2b69c3c035b6a..b7bf81a3ba133 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/fs/proc/base.c b/fs/proc/base.c -index 9df3f48396628..0fedd00505771 100644 +index 9df3f4839662..0fedd0050577 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, @@ -10652,7 +14431,7 @@ index 9df3f48396628..0fedd00505771 100644 seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); diff --git a/include/linux/ksm.h b/include/linux/ksm.h -index 899a314bc4872..c2dd786a30e1f 100644 +index 899a314bc487..c2dd786a30e1 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -26,6 +26,22 @@ int ksm_disable(struct mm_struct *mm); @@ -10690,7 +14469,7 @@ index 899a314bc4872..c2dd786a30e1f 100644 static inline void collect_procs_ksm(struct page *page, struct list_head *to_kill, int force_early) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 5e74ce4a28cd6..51d04c1847c11 100644 +index 7d30dc4ff0ff..d8d8cc1348d6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -812,7 +812,7 @@ struct mm_struct { @@ -10717,7 +14496,7 @@ index 5e74ce4a28cd6..51d04c1847c11 100644 struct { /* this mm_struct is on lru_gen_mm_list */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h -index 03e3d0121d5e3..16597dea90f40 100644 +index 03e3d0121d5e..16597dea90f4 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -813,6 +813,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); @@ -10731,7 +14510,7 @@ index 03e3d0121d5e3..16597dea90f40 100644 unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h -index fd6c1cb585db4..11d0fc82c4378 100644 +index fd6c1cb585db..11d0fc82c437 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -820,8 +820,17 @@ __SYSCALL(__NR_set_mempolicy_home_node, 
sys_set_mempolicy_home_node) @@ -10754,7 +14533,7 @@ index fd6c1cb585db4..11d0fc82c4378 100644 /* * 32 bit systems traditionally used different diff --git a/kernel/sys.c b/kernel/sys.c -index 2410e3999ebe5..b0841a2dd2b7a 100644 +index 2410e3999ebe..b0841a2dd2b7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2727,6 +2727,153 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, @@ -10912,7 +14691,7 @@ index 2410e3999ebe5..b0841a2dd2b7a 100644 struct getcpu_cache __user *, unused) { diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c -index 781de7cc6a4e1..49a35d35d0f97 100644 +index 781de7cc6a4e..49a35d35d0f9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -184,6 +184,9 @@ COND_SYSCALL(mincore); @@ -10926,7 +14705,7 @@ index 781de7cc6a4e1..49a35d35d0f97 100644 COND_SYSCALL(mbind); COND_SYSCALL(get_mempolicy); diff --git a/mm/khugepaged.c b/mm/khugepaged.c -index 78c8d5d8b6284..4b8b8673d5d9f 100644 +index 78c8d5d8b628..4b8b8673d5d9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -19,6 +19,7 @@ @@ -10946,7 +14725,7 @@ index 78c8d5d8b6284..4b8b8673d5d9f 100644 } else { src_page = pte_page(pteval); diff --git a/mm/ksm.c b/mm/ksm.c -index d20d7662419be..74804158ee02d 100644 +index d7b5b95e936e..6b7b8928fb96 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -278,6 +278,9 @@ static unsigned int zero_checksum __read_mostly; @@ -10969,7 +14748,7 @@ index d20d7662419be..74804158ee02d 100644 pte_unmap_unlock(pte, ptl); return ret; } -@@ -1222,8 +1226,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, +@@ -1229,8 +1233,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, page_add_anon_rmap(kpage, vma, addr, RMAP_NONE); newpte = mk_pte(kpage, vma->vm_page_prot); } else { @@ -10986,7 +14765,7 @@ index d20d7662419be..74804158ee02d 100644 /* * We're replacing an anonymous page with a zero page, which is * not anonymous. 
We need to do proper accounting otherwise we -@@ -3084,7 +3094,7 @@ static void wait_while_offlining(void) +@@ -3091,7 +3101,7 @@ static void wait_while_offlining(void) #ifdef CONFIG_PROC_FS long ksm_process_profit(struct mm_struct *mm) { @@ -10995,7 +14774,7 @@ index d20d7662419be..74804158ee02d 100644 mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ -@@ -3353,12 +3363,19 @@ static ssize_t pages_volatile_show(struct kobject *kobj, +@@ -3360,12 +3370,19 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); @@ -11016,7 +14795,7 @@ index d20d7662419be..74804158ee02d 100644 ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); -@@ -3420,6 +3437,7 @@ static struct attribute *ksm_attrs[] = { +@@ -3427,6 +3444,7 @@ static struct attribute *ksm_attrs[] = { &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, @@ -11025,7 +14804,7 @@ index d20d7662419be..74804158ee02d 100644 #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, diff --git a/mm/memory.c b/mm/memory.c -index 1ec1ef3418bf5..014dd58b3ffe9 100644 +index cdc4d4c1c858..428943ecda25 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1433,8 +1433,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -11049,7 +14828,7 @@ index 1ec1ef3418bf5..014dd58b3ffe9 100644 } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c -index 26853badae705..0de9d33cd565d 100644 +index 26853badae70..0de9d33cd565 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -29,6 +29,8 @@ @@ -11188,12 +14967,12 @@ index 26853badae705..0de9d33cd565d 100644 #ifdef __NR_userfaultfd test_unmerge_uffd_wp(); -- -2.41.0 +2.42.0 -From e376a8aadd07d72875ff77bfc6c3d2ba9ac549bd Mon Sep 17 00:00:00 2001 +From 49274c8196e04f14f8af83a59ff82e2ae00ac21b Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 10 Jul 2023 17:11:55 +0200 -Subject: [PATCH 6/6] zstd +Subject: [PATCH 7/7] zstd Signed-off-by: Peter Jung --- @@ -11260,7 +15039,7 @@ Signed-off-by: Peter Jung create mode 100644 lib/zstd/common/bits.h diff --git a/include/linux/zstd.h b/include/linux/zstd.h -index 113408eef6ece..f109d49f43f80 100644 +index 113408eef6ec..f109d49f43f8 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -1,6 +1,6 @@ @@ -11272,7 +15051,7 @@ index 113408eef6ece..f109d49f43f80 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h -index 58b6dd45a969f..6d5cf55f0bf3e 100644 +index 58b6dd45a969..6d5cf55f0bf3 100644 --- a/include/linux/zstd_errors.h +++ b/include/linux/zstd_errors.h @@ -1,5 +1,6 @@ @@ -11338,7 +15117,7 @@ index 58b6dd45a969f..6d5cf55f0bf3e 100644 } ZSTD_ErrorCode; diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h -index 79d55465d5c1d..8b4ffe649df57 100644 +index 79d55465d5c1..8b4ffe649df5 100644 --- a/include/linux/zstd_lib.h +++ b/include/linux/zstd_lib.h @@ -1,5 +1,6 @@ @@ -12530,7 +16309,7 @@ index 79d55465d5c1d..8b4ffe649df57 100644 #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile -index 20f08c644b71a..464c410b2768c 100644 +index 20f08c644b71..464c410b2768 100644 --- a/lib/zstd/Makefile +++ b/lib/zstd/Makefile @@ -1,6 +1,6 @@ @@ -12543,7 +16322,7 @@ index 20f08c644b71a..464c410b2768c 
100644 # This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h new file mode 100644 -index 0000000000000..05adbbeccaa9b +index 000000000000..05adbbeccaa9 --- /dev/null +++ b/lib/zstd/common/allocations.h @@ -0,0 +1,56 @@ @@ -12605,7 +16384,7 @@ index 0000000000000..05adbbeccaa9b +#endif /* ZSTD_ALLOCATIONS_H */ diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h new file mode 100644 -index 0000000000000..aa3487ec4b6a7 +index 000000000000..aa3487ec4b6a --- /dev/null +++ b/lib/zstd/common/bits.h @@ -0,0 +1,149 @@ @@ -12759,7 +16538,7 @@ index 0000000000000..aa3487ec4b6a7 + +#endif /* ZSTD_BITS_H */ diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h -index feef3a1b1d600..444dc4f85c649 100644 +index feef3a1b1d60..444dc4f85c64 100644 --- a/lib/zstd/common/bitstream.h +++ b/lib/zstd/common/bitstream.h @@ -1,7 +1,8 @@ @@ -12886,7 +16665,7 @@ index feef3a1b1d600..444dc4f85c649 100644 if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ return BIT_DStream_overflow; diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h -index c42d39faf9bd8..c437e09755750 100644 +index c42d39faf9bd..c437e0975575 100644 --- a/lib/zstd/common/compiler.h +++ b/lib/zstd/common/compiler.h @@ -1,5 +1,6 @@ @@ -12916,7 +16695,7 @@ index c42d39faf9bd8..c437e09755750 100644 #endif /* ZSTD_COMPILER_H */ diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h -index 0db7b42407eea..d8319a2bef4ce 100644 +index 0db7b42407ee..d8319a2bef4c 100644 --- a/lib/zstd/common/cpu.h +++ b/lib/zstd/common/cpu.h @@ -1,5 +1,6 @@ @@ -12928,7 +16707,7 @@ index 0db7b42407eea..d8319a2bef4ce 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c -index bb863c9ea6164..e56ff6464e918 100644 +index bb863c9ea616..e56ff6464e91 100644 --- a/lib/zstd/common/debug.c +++ b/lib/zstd/common/debug.c @@ -1,7 +1,8 @@ @@ -12942,7 +16721,7 @@ index bb863c9ea6164..e56ff6464e918 100644 * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h -index 6dd88d1fbd02c..da0dbfc614b88 100644 +index 6dd88d1fbd02..da0dbfc614b8 100644 --- a/lib/zstd/common/debug.h +++ b/lib/zstd/common/debug.h @@ -1,7 +1,8 @@ @@ -12956,7 +16735,7 @@ index 6dd88d1fbd02c..da0dbfc614b88 100644 * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c -index fef67056f0524..6cdd82233fb59 100644 +index fef67056f052..6cdd82233fb5 100644 --- a/lib/zstd/common/entropy_common.c +++ b/lib/zstd/common/entropy_common.c @@ -1,6 +1,7 @@ @@ -13074,7 +16853,7 @@ index fef67056f0524..6cdd82233fb59 100644 return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); } diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c -index 6d1135f8c3733..a4062d30d1703 100644 +index 6d1135f8c373..a4062d30d170 100644 --- a/lib/zstd/common/error_private.c +++ b/lib/zstd/common/error_private.c @@ -1,5 +1,6 @@ @@ -13122,7 +16901,7 @@ index 6d1135f8c3733..a4062d30d1703 100644 default: return notErrorCode; } diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h -index ca5101e542faa..9a4699a38a881 100644 
+index ca5101e542fa..9a4699a38a88 100644 --- a/lib/zstd/common/error_private.h +++ b/lib/zstd/common/error_private.h @@ -1,5 +1,6 @@ @@ -13134,7 +16913,7 @@ index ca5101e542faa..9a4699a38a881 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h -index 4507043b2287c..c4e25a2191429 100644 +index 4507043b2287..c4e25a219142 100644 --- a/lib/zstd/common/fse.h +++ b/lib/zstd/common/fse.h @@ -1,7 +1,8 @@ @@ -13286,7 +17065,7 @@ index 4507043b2287c..c4e25a2191429 100644 * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c -index a0d06095be83d..45cf457f31ef8 100644 +index a0d06095be83..45cf457f31ef 100644 --- a/lib/zstd/common/fse_decompress.c +++ b/lib/zstd/common/fse_decompress.c @@ -1,6 +1,7 @@ @@ -13446,7 +17225,7 @@ index a0d06095be83d..45cf457f31ef8 100644 - #endif /* FSE_COMMONDEFS_ONLY */ diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h -index 5042ff8703087..8e7943092ed1a 100644 +index 5042ff870308..8e7943092ed1 100644 --- a/lib/zstd/common/huf.h +++ b/lib/zstd/common/huf.h @@ -1,7 +1,8 @@ @@ -13773,7 +17552,7 @@ index 5042ff8703087..8e7943092ed1a 100644 +#endif /* HUF_H_298734234 */ diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h -index 1d9cc03924ca9..a7231822b6e32 100644 +index 1d9cc03924ca..a7231822b6e3 100644 --- a/lib/zstd/common/mem.h +++ b/lib/zstd/common/mem.h @@ -1,6 +1,6 @@ @@ -13785,7 +17564,7 @@ index 1d9cc03924ca9..a7231822b6e32 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h -index 0e3b2c0a527db..7ede8cf1ffe57 100644 +index 0e3b2c0a527d..7ede8cf1ffe5 100644 --- a/lib/zstd/common/portability_macros.h +++ b/lib/zstd/common/portability_macros.h @@ -1,5 +1,6 @@ @@ -13839,7 +17618,7 @@ index 0e3b2c0a527db..7ede8cf1ffe57 100644 + #endif /* ZSTD_PORTABILITY_MACROS_H */ diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c -index 3d7e35b309b5d..44b95b25344a1 100644 +index 3d7e35b309b5..44b95b25344a 100644 --- a/lib/zstd/common/zstd_common.c +++ b/lib/zstd/common/zstd_common.c @@ -1,5 +1,6 @@ @@ -13897,7 +17676,7 @@ index 3d7e35b309b5d..44b95b25344a1 100644 - } -} diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h -index 2c34e8a33a1c1..670c5fa2a952d 100644 +index 2c34e8a33a1c..670c5fa2a952 100644 --- a/lib/zstd/common/zstd_deps.h +++ b/lib/zstd/common/zstd_deps.h @@ -1,6 +1,6 @@ @@ -13931,7 +17710,7 @@ index 2c34e8a33a1c1..670c5fa2a952d 100644 +#endif /* ZSTD_DEPS_STDINT */ +#endif /* ZSTD_DEPS_NEED_STDINT */ diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h -index 93305d9b41bba..7f023e4d47740 100644 +index 93305d9b41bb..7f023e4d4774 100644 --- a/lib/zstd/common/zstd_internal.h +++ b/lib/zstd/common/zstd_internal.h @@ -1,5 +1,6 @@ @@ -14117,7 +17896,7 @@ index 93305d9b41bba..7f023e4d47740 100644 /* ZSTD_invalidateRepCodes() : diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h -index d9a76112ec3af..6ab8be6532efc 100644 +index d9a76112ec3a..6ab8be6532ef 100644 --- a/lib/zstd/compress/clevels.h +++ b/lib/zstd/compress/clevels.h @@ -1,5 +1,6 @@ @@ -14129,7 +17908,7 @@ index d9a76112ec3af..6ab8be6532efc 100644 * * This source code is licensed under both the BSD-style license (found in the 
diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c -index ec5b1ca6d71af..e46ca6621b488 100644 +index ec5b1ca6d71a..e46ca6621b48 100644 --- a/lib/zstd/compress/fse_compress.c +++ b/lib/zstd/compress/fse_compress.c @@ -1,6 +1,7 @@ @@ -14248,7 +18027,7 @@ index ec5b1ca6d71af..e46ca6621b488 100644 - #endif /* FSE_COMMONDEFS_ONLY */ diff --git a/lib/zstd/compress/hist.c b/lib/zstd/compress/hist.c -index 3ddc6dfb68948..0b12587cc14b1 100644 +index 3ddc6dfb6894..0b12587cc14b 100644 --- a/lib/zstd/compress/hist.c +++ b/lib/zstd/compress/hist.c @@ -1,7 +1,8 @@ @@ -14262,7 +18041,7 @@ index 3ddc6dfb68948..0b12587cc14b1 100644 * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/compress/hist.h b/lib/zstd/compress/hist.h -index fc1830abc9c63..f7687b0fc20a0 100644 +index fc1830abc9c6..f7687b0fc20a 100644 --- a/lib/zstd/compress/hist.h +++ b/lib/zstd/compress/hist.h @@ -1,7 +1,8 @@ @@ -14276,7 +18055,7 @@ index fc1830abc9c63..f7687b0fc20a0 100644 * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compress.c -index 74ef0db476210..83241abafe35e 100644 +index 74ef0db47621..83241abafe35 100644 --- a/lib/zstd/compress/huf_compress.c +++ b/lib/zstd/compress/huf_compress.c @@ -1,6 +1,7 @@ @@ -15033,7 +18812,7 @@ index 74ef0db476210..83241abafe35e 100644 } - diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c -index f620cafca633b..c1c316e9e289f 100644 +index f620cafca633..c1c316e9e289 100644 --- a/lib/zstd/compress/zstd_compress.c +++ b/lib/zstd/compress/zstd_compress.c @@ -1,5 +1,6 @@ @@ -18415,7 +22194,7 @@ index f620cafca633b..c1c316e9e289f 100644 + } +} diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h -index 71697a11ae305..899f5e2de8e96 100644 +index 71697a11ae30..899f5e2de8e9 100644 --- a/lib/zstd/compress/zstd_compress_internal.h +++ b/lib/zstd/compress/zstd_compress_internal.h @@ -1,5 +1,6 @@ @@ -18969,7 +22748,7 @@ index 71697a11ae305..899f5e2de8e96 100644 + #endif /* ZSTD_COMPRESS_H */ diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c -index 52b0a8059aba9..3e9ea46a670a6 100644 +index 52b0a8059aba..3e9ea46a670a 100644 --- a/lib/zstd/compress/zstd_compress_literals.c +++ b/lib/zstd/compress/zstd_compress_literals.c @@ -1,5 +1,6 @@ @@ -19211,7 +22990,7 @@ index 52b0a8059aba9..3e9ea46a670a6 100644 MEM_writeLE32(ostart, lhc); ostart[4] = (BYTE)(cLitSize >> 10); diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h -index 9775fb97cb702..a2a85d6b69e53 100644 +index 9775fb97cb70..a2a85d6b69e5 100644 --- a/lib/zstd/compress/zstd_compress_literals.h +++ b/lib/zstd/compress/zstd_compress_literals.h @@ -1,5 +1,6 @@ @@ -19255,7 +23034,7 @@ index 9775fb97cb702..a2a85d6b69e53 100644 #endif /* ZSTD_COMPRESS_LITERALS_H */ diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c -index 21ddc1b37acf8..5c028c78d889b 100644 +index 21ddc1b37acf..5c028c78d889 100644 --- a/lib/zstd/compress/zstd_compress_sequences.c +++ b/lib/zstd/compress/zstd_compress_sequences.c @@ -1,5 +1,6 @@ @@ -19285,7 +23064,7 @@ index 21ddc1b37acf8..5c028c78d889b 100644 * If basic encoding isn't possible, always choose RLE. 
*/ diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h -index 7991364c2f71f..7fe6f4ff5cf25 100644 +index 7991364c2f71..7fe6f4ff5cf2 100644 --- a/lib/zstd/compress/zstd_compress_sequences.h +++ b/lib/zstd/compress/zstd_compress_sequences.h @@ -1,5 +1,6 @@ @@ -19297,7 +23076,7 @@ index 7991364c2f71f..7fe6f4ff5cf25 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c -index 17d836cc84e8f..dbacbaf727338 100644 +index 17d836cc84e8..dbacbaf72733 100644 --- a/lib/zstd/compress/zstd_compress_superblock.c +++ b/lib/zstd/compress/zstd_compress_superblock.c @@ -1,5 +1,6 @@ @@ -19396,7 +23175,7 @@ index 17d836cc84e8f..dbacbaf727338 100644 ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); } diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h -index 224ece79546eb..826bbc9e029b1 100644 +index 224ece79546e..826bbc9e029b 100644 --- a/lib/zstd/compress/zstd_compress_superblock.h +++ b/lib/zstd/compress/zstd_compress_superblock.h @@ -1,5 +1,6 @@ @@ -19408,7 +23187,7 @@ index 224ece79546eb..826bbc9e029b1 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h -index 349fc923c355a..65ea53b628447 100644 +index 349fc923c355..65ea53b62844 100644 --- a/lib/zstd/compress/zstd_cwksp.h +++ b/lib/zstd/compress/zstd_cwksp.h @@ -1,5 +1,6 @@ @@ -19713,7 +23492,7 @@ index 349fc923c355a..65ea53b628447 100644 diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c -index 76933dea2624e..ab9440a996039 100644 +index 76933dea2624..ab9440a99603 100644 --- a/lib/zstd/compress/zstd_double_fast.c +++ b/lib/zstd/compress/zstd_double_fast.c @@ -1,5 +1,6 @@ @@ -20026,7 +23805,7 @@ index 76933dea2624e..ab9440a996039 100644 hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; ip += repLength2; diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h -index 6822bde65a1d8..0204f12e4cf70 100644 +index 6822bde65a1d..0204f12e4cf7 100644 --- a/lib/zstd/compress/zstd_double_fast.h +++ b/lib/zstd/compress/zstd_double_fast.h @@ -1,5 +1,6 @@ @@ -20048,7 +23827,7 @@ index 6822bde65a1d8..0204f12e4cf70 100644 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c -index a752e6beab52e..3399b39c5dbc5 100644 +index a752e6beab52..3399b39c5dbc 100644 --- a/lib/zstd/compress/zstd_fast.c +++ b/lib/zstd/compress/zstd_fast.c @@ -1,5 +1,6 @@ @@ -20831,7 +24610,7 @@ index a752e6beab52e..3399b39c5dbc5 100644 { default: /* includes case 3 */ diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h -index fddc2f532d21d..e64d9e1b2d393 100644 +index fddc2f532d21..e64d9e1b2d39 100644 --- a/lib/zstd/compress/zstd_fast.h +++ b/lib/zstd/compress/zstd_fast.h @@ -1,5 +1,6 @@ @@ -20853,7 +24632,7 @@ index fddc2f532d21d..e64d9e1b2d393 100644 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c -index 0298a01a7504a..f6b4978ceba7f 100644 +index 0298a01a7504..f6b4978ceba7 100644 --- a/lib/zstd/compress/zstd_lazy.c +++ b/lib/zstd/compress/zstd_lazy.c @@ -1,5 +1,6 @@ @@ -21916,7 +25695,7 @@ index 
0298a01a7504a..f6b4978ceba7f 100644 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); } diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h -index e5bdf4df8dde0..9505bed93c031 100644 +index e5bdf4df8dde..9505bed93c03 100644 --- a/lib/zstd/compress/zstd_lazy.h +++ b/lib/zstd/compress/zstd_lazy.h @@ -1,5 +1,6 @@ @@ -21946,7 +25725,7 @@ index e5bdf4df8dde0..9505bed93c031 100644 #endif /* ZSTD_LAZY_H */ diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c -index dd86fc83e7dde..b7da76b0db7c4 100644 +index dd86fc83e7dd..b7da76b0db7c 100644 --- a/lib/zstd/compress/zstd_ldm.c +++ b/lib/zstd/compress/zstd_ldm.c @@ -1,5 +1,6 @@ @@ -21990,7 +25769,7 @@ index dd86fc83e7dde..b7da76b0db7c4 100644 ip += sequence.matchLength; } diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h -index fbc6a5e88fd7a..c540731abde72 100644 +index fbc6a5e88fd7..c540731abde7 100644 --- a/lib/zstd/compress/zstd_ldm.h +++ b/lib/zstd/compress/zstd_ldm.h @@ -1,5 +1,6 @@ @@ -22002,7 +25781,7 @@ index fbc6a5e88fd7a..c540731abde72 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h -index 647f865be2903..cfccfc46f6f7b 100644 +index 647f865be290..cfccfc46f6f7 100644 --- a/lib/zstd/compress/zstd_ldm_geartab.h +++ b/lib/zstd/compress/zstd_ldm_geartab.h @@ -1,5 +1,6 @@ @@ -22014,7 +25793,7 @@ index 647f865be2903..cfccfc46f6f7b 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c -index fd82acfda62f6..1e41cb04f4820 100644 +index fd82acfda62f..1e41cb04f482 100644 --- a/lib/zstd/compress/zstd_opt.c +++ b/lib/zstd/compress/zstd_opt.c @@ -1,5 +1,6 @@ @@ -22496,7 +26275,7 @@ index fd82acfda62f6..1e41cb04f4820 100644 ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); } diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h -index 22b862858ba7a..faa73ff4b03dc 100644 +index 22b862858ba7..faa73ff4b03d 100644 --- a/lib/zstd/compress/zstd_opt.h +++ b/lib/zstd/compress/zstd_opt.h @@ -1,5 +1,6 @@ @@ -22508,7 +26287,7 @@ index 22b862858ba7a..faa73ff4b03dc 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c -index 60958afebc415..d172e35fbd9a6 100644 +index 60958afebc41..d172e35fbd9a 100644 --- a/lib/zstd/decompress/huf_decompress.c +++ b/lib/zstd/decompress/huf_decompress.c @@ -1,7 +1,8 @@ @@ -23699,7 +27478,7 @@ index 60958afebc415..d172e35fbd9a6 100644 } - diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c -index dbbc7919de534..30ef65e1ab5ca 100644 +index dbbc7919de53..30ef65e1ab5c 100644 --- a/lib/zstd/decompress/zstd_ddict.c +++ b/lib/zstd/decompress/zstd_ddict.c @@ -1,5 +1,6 @@ @@ -23741,7 +27520,7 @@ index dbbc7919de534..30ef65e1ab5ca 100644 + return ddict->dictID; } diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h -index 8c1a79d666f89..de459a0dacd19 100644 +index 8c1a79d666f8..de459a0dacd1 100644 --- a/lib/zstd/decompress/zstd_ddict.h +++ b/lib/zstd/decompress/zstd_ddict.h @@ -1,5 +1,6 @@ @@ -23753,7 +27532,7 @@ index 8c1a79d666f89..de459a0dacd19 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/decompress/zstd_decompress.c 
b/lib/zstd/decompress/zstd_decompress.c -index 6b3177c947114..03dbdf39109f9 100644 +index 6b3177c94711..03dbdf39109f 100644 --- a/lib/zstd/decompress/zstd_decompress.c +++ b/lib/zstd/decompress/zstd_decompress.c @@ -1,5 +1,6 @@ @@ -24310,7 +28089,7 @@ index 6b3177c947114..03dbdf39109f9 100644 + } } diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c -index c1913b8e7c897..9f5577e5bc19d 100644 +index c1913b8e7c89..9f5577e5bc19 100644 --- a/lib/zstd/decompress/zstd_decompress_block.c +++ b/lib/zstd/decompress/zstd_decompress_block.c @@ -1,5 +1,6 @@ @@ -24847,7 +28626,7 @@ index c1913b8e7c897..9f5577e5bc19d 100644 + return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); +} diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h -index 3d2d57a5d25a7..5888e6cc788b5 100644 +index 3d2d57a5d25a..5888e6cc788b 100644 --- a/lib/zstd/decompress/zstd_decompress_block.h +++ b/lib/zstd/decompress/zstd_decompress_block.h @@ -1,5 +1,6 @@ @@ -24870,7 +28649,7 @@ index 3d2d57a5d25a7..5888e6cc788b5 100644 #endif /* ZSTD_DEC_BLOCK_H */ diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h -index 98102edb6a832..32f79fb2873df 100644 +index 98102edb6a83..32f79fb2873d 100644 --- a/lib/zstd/decompress/zstd_decompress_internal.h +++ b/lib/zstd/decompress/zstd_decompress_internal.h @@ -1,5 +1,6 @@ @@ -24905,7 +28684,7 @@ index 98102edb6a832..32f79fb2873df 100644 /* streaming */ ZSTD_dStreamStage streamStage; diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h -index a06ca187aab5f..8a47eb2a45145 100644 +index a06ca187aab5..8a47eb2a4514 100644 --- a/lib/zstd/decompress_sources.h +++ b/lib/zstd/decompress_sources.h @@ -1,6 +1,6 @@ @@ -24917,7 +28696,7 @@ index a06ca187aab5f..8a47eb2a45145 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c -index 22686e367e6f0..466828e357525 100644 +index 22686e367e6f..466828e35752 100644 --- a/lib/zstd/zstd_common_module.c +++ b/lib/zstd/zstd_common_module.c @@ -1,6 +1,6 @@ @@ -24939,7 +28718,7 @@ index 22686e367e6f0..466828e357525 100644 MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("Zstd Common"); diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c -index 04e1b5c01d9b6..8ecf43226af2f 100644 +index 04e1b5c01d9b..8ecf43226af2 100644 --- a/lib/zstd/zstd_compress_module.c +++ b/lib/zstd/zstd_compress_module.c @@ -1,6 +1,6 @@ @@ -24951,7 +28730,7 @@ index 04e1b5c01d9b6..8ecf43226af2f 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c -index f4ed952ed4852..eb1c49e69722f 100644 +index f4ed952ed485..eb1c49e69722 100644 --- a/lib/zstd/zstd_decompress_module.c +++ b/lib/zstd/zstd_decompress_module.c @@ -1,6 +1,6 @@ @@ -24963,4 +28742,4 @@ index f4ed952ed4852..eb1c49e69722f 100644 * * This source code is licensed under both the BSD-style license (found in the -- -2.41.0 +2.42.0 diff --git a/patches/0002-eevdf.patch b/patches/0002-eevdf.patch index bcda337..710de28 100644 --- a/patches/0002-eevdf.patch +++ b/patches/0002-eevdf.patch @@ -1,26 +1,27 @@ -From 6d15f875cb0c7fd65fc422c0545d57fc2e124f7c Mon Sep 17 00:00:00 2001 +From 9a3788351b1bc830a28d7a51740d2ee964ab8319 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 20 Aug 2023 15:56:13 
+0200 -Subject: [PATCH] EEVDF-cachy +Date: Mon, 28 Aug 2023 14:04:00 +0200 +Subject: [PATCH] EEVDF Signed-off-by: Peter Jung --- - Documentation/admin-guide/cgroup-v2.rst | 10 + - include/linux/rbtree_augmented.h | 26 + - include/linux/sched.h | 8 +- - include/uapi/linux/sched.h | 4 +- - include/uapi/linux/sched/types.h | 19 + - init/init_task.c | 3 +- - kernel/sched/core.c | 65 +- - kernel/sched/debug.c | 49 +- - kernel/sched/fair.c | 1150 +++++++++++------------ - kernel/sched/features.h | 24 +- - kernel/sched/sched.h | 21 +- - tools/include/uapi/linux/sched.h | 4 +- - 12 files changed, 715 insertions(+), 668 deletions(-) + Documentation/admin-guide/cgroup-v2.rst | 10 + + Documentation/scheduler/sched-design-CFS.rst | 2 +- + include/linux/rbtree_augmented.h | 26 + + include/linux/sched.h | 8 +- + include/uapi/linux/sched.h | 4 +- + include/uapi/linux/sched/types.h | 19 + + init/init_task.c | 3 +- + kernel/sched/core.c | 65 +- + kernel/sched/debug.c | 49 +- + kernel/sched/fair.c | 1150 ++++++++---------- + kernel/sched/features.h | 24 +- + kernel/sched/sched.h | 21 +- + tools/include/uapi/linux/sched.h | 4 +- + 13 files changed, 716 insertions(+), 669 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst -index 4ef8901911961..3a8d3e1e55910 100644 +index 4ef890191196..3a8d3e1e5591 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1121,6 +1121,16 @@ All time durations are in microseconds. @@ -40,8 +41,21 @@ index 4ef8901911961..3a8d3e1e55910 100644 Memory +diff --git a/Documentation/scheduler/sched-design-CFS.rst b/Documentation/scheduler/sched-design-CFS.rst +index 03db55504515..f68919800f05 100644 +--- a/Documentation/scheduler/sched-design-CFS.rst ++++ b/Documentation/scheduler/sched-design-CFS.rst +@@ -94,7 +94,7 @@ other HZ detail. Thus the CFS scheduler has no notion of "timeslices" in the + way the previous scheduler had, and has no heuristics whatsoever. There is + only one central tunable (you have to switch on CONFIG_SCHED_DEBUG): + +- /sys/kernel/debug/sched/min_granularity_ns ++ /sys/kernel/debug/sched/base_slice_ns + + which can be used to tune the scheduler from "desktop" (i.e., low latencies) to + "server" (i.e., good batching) workloads. 
It defaults to a setting suitable diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h -index 7ee7ed5de7227..6dbc5a1bf6a8c 100644 +index 7ee7ed5de722..6dbc5a1bf6a8 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node, @@ -78,7 +92,7 @@ index 7ee7ed5de7227..6dbc5a1bf6a8c 100644 * Template for declaring augmented rbtree callbacks (generic case) * diff --git a/include/linux/sched.h b/include/linux/sched.h -index 609bde814cb06..c940c4dc83048 100644 +index 609bde814cb0..c940c4dc8304 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -549,13 +549,18 @@ struct sched_entity { @@ -110,7 +124,7 @@ index 609bde814cb06..c940c4dc83048 100644 struct sched_entity se; struct sched_rt_entity rt; diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ceab26..b2e932c25be62 100644 +index 3bac0a8ceab2..b2e932c25be6 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -132,6 +132,7 @@ struct clone_args { @@ -131,7 +145,7 @@ index 3bac0a8ceab26..b2e932c25be62 100644 #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h -index f2c4589d4dbfe..db1e8199e8c80 100644 +index f2c4589d4dbf..db1e8199e8c8 100644 --- a/include/uapi/linux/sched/types.h +++ b/include/uapi/linux/sched/types.h @@ -10,6 +10,7 @@ struct sched_param { @@ -175,7 +189,7 @@ index f2c4589d4dbfe..db1e8199e8c80 100644 #endif /* _UAPI_LINUX_SCHED_TYPES_H */ diff --git a/init/init_task.c b/init/init_task.c -index ff6c4b9bfe6b1..511cbcf3510dc 100644 +index ff6c4b9bfe6b..511cbcf3510d 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -78,6 +78,7 @@ struct task_struct init_task @@ -196,7 +210,7 @@ index ff6c4b9bfe6b1..511cbcf3510dc 100644 .rt = { .run_list = LIST_HEAD_INIT(init_task.rt.run_list), diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index c52c2eba7c739..aff81e12460ed 100644 +index c52c2eba7c73..aff81e12460e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load) @@ -358,7 +372,7 @@ index c52c2eba7c739..aff81e12460ed 100644 #ifdef CONFIG_CFS_BANDWIDTH { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 066ff1c8ae4eb..e7e83181fbb6c 100644 +index 066ff1c8ae4e..e7e83181fbb6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -347,10 +347,7 @@ static __init int sched_init_debug(void) @@ -462,7 +476,7 @@ index 066ff1c8ae4eb..e7e83181fbb6c 100644 P(dl.runtime); P(dl.deadline); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 2c335df301718..e0a4c13dab04f 100644 +index 2c335df30171..e0a4c13dab04 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -47,6 +47,7 @@ @@ -2075,7 +2089,7 @@ index 2c335df301718..e0a4c13dab04f 100644 return rr_interval; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index ee7f23c76bd33..546d212ef40d8 100644 +index ee7f23c76bd3..546d212ef40d 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -1,16 +1,12 @@ @@ -2122,7 +2136,7 @@ index ee7f23c76bd33..546d212ef40d8 100644 -SCHED_FEAT(ALT_PERIOD, true) -SCHED_FEAT(BASE_SLICE, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index e93e006a942b9..67cd7e1fd5016 100644 +index e93e006a942b..67cd7e1fd501 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -372,6 +372,8 @@ struct task_group { @@ -2202,7 +2216,7 @@ index 
e93e006a942b9..67cd7e1fd5016 100644 + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h -index 3bac0a8ceab26..b2e932c25be62 100644 +index 3bac0a8ceab2..b2e932c25be6 100644 --- a/tools/include/uapi/linux/sched.h +++ b/tools/include/uapi/linux/sched.h @@ -132,6 +132,7 @@ struct clone_args { @@ -2223,4 +2237,4 @@ index 3bac0a8ceab26..b2e932c25be62 100644 #endif /* _UAPI_LINUX_SCHED_H */ -- -2.41.0 +2.42.0 diff --git a/patches/0003-bcachefs.patch b/patches/0003-bcachefs.patch index d7ad7bd..5bee813 100644 --- a/patches/0003-bcachefs.patch +++ b/patches/0003-bcachefs.patch @@ -1,23 +1,19 @@ -From 5f9d0663e5c9895cfa7238b3456e2a268daf5878 Mon Sep 17 00:00:00 2001 +From 31f38fa87a86e086ffcc063e7e24702064eda50f Mon Sep 17 00:00:00 2001 From: Piotr Gorski -Date: Fri, 21 Jul 2023 08:07:37 +0200 +Date: Tue, 29 Aug 2023 12:14:18 +0200 Subject: [PATCH] bcachefs Signed-off-by: Piotr Gorski --- - Documentation/admin-guide/sysctl/vm.rst | 16 + - Documentation/filesystems/proc.rst | 28 + - MAINTAINERS | 56 + - arch/arm64/include/asm/spectre.h | 4 +- + MAINTAINERS | 32 + arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- - arch/x86/kernel/amd_gart_64.c | 2 +- - block/bdev.c | 2 +- block/bio.c | 18 +- block/blk-core.c | 1 + block/blk.h | 1 - + drivers/accel/ivpu/ivpu_gem.c | 8 +- + drivers/accel/ivpu/ivpu_gem.h | 2 +- drivers/block/virtio_blk.c | 4 +- drivers/gpu/drm/gud/gud_drv.c | 2 +- - drivers/iommu/dma-iommu.c | 2 +- drivers/md/bcache/Kconfig | 10 +- drivers/md/bcache/Makefile | 4 +- drivers/md/bcache/bcache.h | 2 +- @@ -27,118 +23,125 @@ Signed-off-by: Piotr Gorski drivers/mtd/spi-nor/debugfs.c | 6 +- .../ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 4 +- drivers/scsi/sd.c | 8 +- - drivers/xen/grant-dma-ops.c | 2 +- - drivers/xen/swiotlb-xen.c | 2 +- fs/Kconfig | 1 + fs/Makefile | 1 + - fs/aio.c | 70 +- - fs/bcachefs/Kconfig | 77 + - fs/bcachefs/Makefile | 74 + - fs/bcachefs/acl.c | 412 ++ + fs/aio.c | 66 +- + fs/bcachefs/Kconfig | 76 + + fs/bcachefs/Makefile | 83 + + fs/bcachefs/acl.c | 412 +++ fs/bcachefs/acl.h | 58 + - fs/bcachefs/alloc_background.c | 2209 +++++++++ + fs/bcachefs/alloc_background.c | 2157 +++++++++++ fs/bcachefs/alloc_background.h | 257 ++ - fs/bcachefs/alloc_foreground.c | 1536 +++++++ - fs/bcachefs/alloc_foreground.h | 224 + + fs/bcachefs/alloc_foreground.c | 1571 ++++++++ + fs/bcachefs/alloc_foreground.h | 224 ++ fs/bcachefs/alloc_types.h | 126 + - fs/bcachefs/backpointers.c | 873 ++++ + fs/bcachefs/backpointers.c | 873 +++++ fs/bcachefs/backpointers.h | 131 + fs/bcachefs/bbpos.h | 48 + - fs/bcachefs/bcachefs.h | 1201 +++++ - fs/bcachefs/bcachefs_format.h | 2319 ++++++++++ + fs/bcachefs/bcachefs.h | 1146 ++++++ + fs/bcachefs/bcachefs_format.h | 2368 ++++++++++++ fs/bcachefs/bcachefs_ioctl.h | 368 ++ - fs/bcachefs/bkey.c | 1063 +++++ - fs/bcachefs/bkey.h | 774 ++++ + fs/bcachefs/bkey.c | 1107 ++++++ + fs/bcachefs/bkey.h | 782 ++++ fs/bcachefs/bkey_buf.h | 61 + fs/bcachefs/bkey_cmp.h | 129 + - fs/bcachefs/bkey_methods.c | 519 +++ - fs/bcachefs/bkey_methods.h | 193 + - fs/bcachefs/bkey_sort.c | 201 + + fs/bcachefs/bkey_methods.c | 456 +++ + fs/bcachefs/bkey_methods.h | 188 + + fs/bcachefs/bkey_sort.c | 201 ++ fs/bcachefs/bkey_sort.h | 44 + - fs/bcachefs/bset.c | 1587 +++++++ + fs/bcachefs/bset.c | 1587 ++++++++ fs/bcachefs/bset.h | 541 +++ - fs/bcachefs/btree_cache.c | 1277 ++++++ + fs/bcachefs/btree_cache.c | 1274 +++++++ fs/bcachefs/btree_cache.h | 130 + - fs/bcachefs/btree_gc.c | 2126 +++++++++ - fs/bcachefs/btree_gc.h | 
113 + - fs/bcachefs/btree_io.c | 2267 ++++++++++ - fs/bcachefs/btree_io.h | 228 + - fs/bcachefs/btree_iter.c | 3214 +++++++++++++ - fs/bcachefs/btree_iter.h | 924 ++++ - fs/bcachefs/btree_key_cache.c | 1088 +++++ + fs/bcachefs/btree_gc.c | 2127 +++++++++++ + fs/bcachefs/btree_gc.h | 114 + + fs/bcachefs/btree_io.c | 2245 ++++++++++++ + fs/bcachefs/btree_io.h | 228 ++ + fs/bcachefs/btree_iter.c | 3194 +++++++++++++++++ + fs/bcachefs/btree_iter.h | 940 +++++ + fs/bcachefs/btree_journal_iter.c | 531 +++ + fs/bcachefs/btree_journal_iter.h | 57 + + fs/bcachefs/btree_key_cache.c | 1088 ++++++ fs/bcachefs/btree_key_cache.h | 48 + fs/bcachefs/btree_locking.c | 797 ++++ - fs/bcachefs/btree_locking.h | 424 ++ - fs/bcachefs/btree_types.h | 743 +++ - fs/bcachefs/btree_update.h | 352 ++ - fs/bcachefs/btree_update_interior.c | 2488 ++++++++++ + fs/bcachefs/btree_locking.h | 423 +++ + fs/bcachefs/btree_trans_commit.c | 1156 ++++++ + fs/bcachefs/btree_types.h | 746 ++++ + fs/bcachefs/btree_update.c | 898 +++++ + fs/bcachefs/btree_update.h | 353 ++ + fs/bcachefs/btree_update_interior.c | 2488 +++++++++++++ fs/bcachefs/btree_update_interior.h | 337 ++ - fs/bcachefs/btree_update_leaf.c | 2097 +++++++++ - fs/bcachefs/btree_write_buffer.c | 372 ++ + fs/bcachefs/btree_write_buffer.c | 375 ++ fs/bcachefs/btree_write_buffer.h | 14 + fs/bcachefs/btree_write_buffer_types.h | 44 + - fs/bcachefs/buckets.c | 2106 +++++++++ - fs/bcachefs/buckets.h | 368 ++ + fs/bcachefs/buckets.c | 2107 +++++++++++ + fs/bcachefs/buckets.h | 413 +++ fs/bcachefs/buckets_types.h | 92 + fs/bcachefs/buckets_waiting_for_journal.c | 166 + fs/bcachefs/buckets_waiting_for_journal.h | 15 + .../buckets_waiting_for_journal_types.h | 23 + fs/bcachefs/chardev.c | 769 ++++ fs/bcachefs/chardev.h | 31 + - fs/bcachefs/checksum.c | 709 +++ - fs/bcachefs/checksum.h | 209 + + fs/bcachefs/checksum.c | 753 ++++ + fs/bcachefs/checksum.h | 211 ++ fs/bcachefs/clock.c | 193 + fs/bcachefs/clock.h | 38 + fs/bcachefs/clock_types.h | 37 + - fs/bcachefs/compress.c | 713 +++ + fs/bcachefs/compress.c | 714 ++++ fs/bcachefs/compress.h | 55 + fs/bcachefs/counters.c | 107 + fs/bcachefs/counters.h | 17 + fs/bcachefs/darray.h | 87 + fs/bcachefs/data_update.c | 562 +++ fs/bcachefs/data_update.h | 43 + - fs/bcachefs/debug.c | 957 ++++ + fs/bcachefs/debug.c | 957 +++++ fs/bcachefs/debug.h | 32 + - fs/bcachefs/dirent.c | 565 +++ + fs/bcachefs/dirent.c | 590 +++ fs/bcachefs/dirent.h | 70 + - fs/bcachefs/disk_groups.c | 555 +++ + fs/bcachefs/disk_groups.c | 556 +++ fs/bcachefs/disk_groups.h | 106 + - fs/bcachefs/ec.c | 1960 ++++++++ - fs/bcachefs/ec.h | 263 ++ + fs/bcachefs/ec.c | 1972 ++++++++++ + fs/bcachefs/ec.h | 260 ++ fs/bcachefs/ec_types.h | 41 + fs/bcachefs/errcode.c | 63 + - fs/bcachefs/errcode.h | 246 + + fs/bcachefs/errcode.h | 252 ++ fs/bcachefs/error.c | 294 ++ - fs/bcachefs/error.h | 206 + + fs/bcachefs/error.h | 206 ++ fs/bcachefs/extent_update.c | 173 + fs/bcachefs/extent_update.h | 12 + - fs/bcachefs/extents.c | 1394 ++++++ + fs/bcachefs/extents.c | 1403 ++++++++ fs/bcachefs/extents.h | 757 ++++ fs/bcachefs/extents_types.h | 40 + fs/bcachefs/eytzinger.h | 281 ++ fs/bcachefs/fifo.h | 127 + fs/bcachefs/fs-common.c | 501 +++ fs/bcachefs/fs-common.h | 43 + - fs/bcachefs/fs-io.c | 3982 +++++++++++++++++ - fs/bcachefs/fs-io.h | 54 + - fs/bcachefs/fs-ioctl.c | 556 +++ + fs/bcachefs/fs-io-buffered.c | 1099 ++++++ + fs/bcachefs/fs-io-buffered.h | 27 + + fs/bcachefs/fs-io-direct.c | 679 ++++ + fs/bcachefs/fs-io-direct.h | 16 + + fs/bcachefs/fs-io-pagecache.c | 788 ++++ + 
fs/bcachefs/fs-io-pagecache.h | 176 + + fs/bcachefs/fs-io.c | 1250 +++++++ + fs/bcachefs/fs-io.h | 184 + + fs/bcachefs/fs-ioctl.c | 559 +++ fs/bcachefs/fs-ioctl.h | 81 + - fs/bcachefs/fs.c | 1943 ++++++++ - fs/bcachefs/fs.h | 208 + - fs/bcachefs/fsck.c | 2471 ++++++++++ + fs/bcachefs/fs.c | 1961 ++++++++++ + fs/bcachefs/fs.h | 209 ++ + fs/bcachefs/fsck.c | 2483 +++++++++++++ fs/bcachefs/fsck.h | 14 + - fs/bcachefs/inode.c | 925 ++++ - fs/bcachefs/inode.h | 201 + - fs/bcachefs/io.c | 3059 +++++++++++++ - fs/bcachefs/io.h | 202 + + fs/bcachefs/inode.c | 1111 ++++++ + fs/bcachefs/inode.h | 204 ++ + fs/bcachefs/io.c | 3051 ++++++++++++++++ + fs/bcachefs/io.h | 202 ++ fs/bcachefs/io_types.h | 165 + - fs/bcachefs/journal.c | 1438 ++++++ + fs/bcachefs/journal.c | 1438 ++++++++ fs/bcachefs/journal.h | 526 +++ - fs/bcachefs/journal_io.c | 1863 ++++++++ - fs/bcachefs/journal_io.h | 64 + - fs/bcachefs/journal_reclaim.c | 873 ++++ + fs/bcachefs/journal_io.c | 1888 ++++++++++ + fs/bcachefs/journal_io.h | 65 + + fs/bcachefs/journal_reclaim.c | 874 +++++ fs/bcachefs/journal_reclaim.h | 86 + - fs/bcachefs/journal_sb.c | 219 + + fs/bcachefs/journal_sb.c | 219 ++ fs/bcachefs/journal_sb.h | 24 + fs/bcachefs/journal_seq_blacklist.c | 322 ++ fs/bcachefs/journal_seq_blacklist.h | 22 + @@ -150,170 +153,124 @@ Signed-off-by: Piotr Gorski fs/bcachefs/lru.h | 69 + fs/bcachefs/migrate.c | 182 + fs/bcachefs/migrate.h | 7 + - fs/bcachefs/move.c | 1168 +++++ - fs/bcachefs/move.h | 96 + + fs/bcachefs/move.c | 1162 ++++++ + fs/bcachefs/move.h | 95 + fs/bcachefs/move_types.h | 36 + - fs/bcachefs/movinggc.c | 421 ++ + fs/bcachefs/movinggc.c | 423 +++ fs/bcachefs/movinggc.h | 12 + fs/bcachefs/nocow_locking.c | 123 + fs/bcachefs/nocow_locking.h | 49 + fs/bcachefs/nocow_locking_types.h | 20 + - fs/bcachefs/opts.c | 592 +++ + fs/bcachefs/opts.c | 599 ++++ fs/bcachefs/opts.h | 563 +++ - fs/bcachefs/printbuf.c | 415 ++ + fs/bcachefs/printbuf.c | 415 +++ fs/bcachefs/printbuf.h | 284 ++ - fs/bcachefs/quota.c | 981 ++++ + fs/bcachefs/quota.c | 981 +++++ fs/bcachefs/quota.h | 74 + fs/bcachefs/quota_types.h | 43 + - fs/bcachefs/rebalance.c | 364 ++ + fs/bcachefs/rebalance.c | 368 ++ fs/bcachefs/rebalance.h | 28 + fs/bcachefs/rebalance_types.h | 26 + - fs/bcachefs/recovery.c | 1670 +++++++ - fs/bcachefs/recovery.h | 60 + + fs/bcachefs/recovery.c | 1057 ++++++ + fs/bcachefs/recovery.h | 33 + + fs/bcachefs/recovery_types.h | 48 + fs/bcachefs/reflink.c | 399 ++ fs/bcachefs/reflink.h | 81 + - fs/bcachefs/replicas.c | 1059 +++++ + fs/bcachefs/replicas.c | 1059 ++++++ fs/bcachefs/replicas.h | 91 + fs/bcachefs/replicas_types.h | 27 + + fs/bcachefs/sb-clean.c | 395 ++ + fs/bcachefs/sb-clean.h | 16 + + fs/bcachefs/sb-members.c | 173 + + fs/bcachefs/sb-members.h | 176 + fs/bcachefs/seqmutex.h | 48 + fs/bcachefs/siphash.c | 173 + fs/bcachefs/siphash.h | 87 + + fs/bcachefs/six.c | 918 +++++ + fs/bcachefs/six.h | 388 ++ + fs/bcachefs/snapshot.c | 1687 +++++++++ + fs/bcachefs/snapshot.h | 272 ++ fs/bcachefs/str_hash.h | 370 ++ - fs/bcachefs/subvolume.c | 1749 ++++++++ - fs/bcachefs/subvolume.h | 258 ++ + fs/bcachefs/subvolume.c | 451 +++ + fs/bcachefs/subvolume.h | 35 + fs/bcachefs/subvolume_types.h | 31 + - fs/bcachefs/super-io.c | 1714 +++++++ - fs/bcachefs/super-io.h | 142 + - fs/bcachefs/super.c | 2007 +++++++++ - fs/bcachefs/super.h | 266 ++ - fs/bcachefs/super_types.h | 51 + - fs/bcachefs/sysfs.c | 1064 +++++ + fs/bcachefs/super-io.c | 1265 +++++++ + fs/bcachefs/super-io.h | 133 + + fs/bcachefs/super.c | 2015 +++++++++++ + fs/bcachefs/super.h 
| 52 + + fs/bcachefs/super_types.h | 52 + + fs/bcachefs/sysfs.c | 1059 ++++++ fs/bcachefs/sysfs.h | 48 + - fs/bcachefs/tests.c | 939 ++++ + fs/bcachefs/tests.c | 970 +++++ fs/bcachefs/tests.h | 15 + fs/bcachefs/trace.c | 16 + - fs/bcachefs/trace.h | 1247 ++++++ + fs/bcachefs/trace.h | 1265 +++++++ fs/bcachefs/two_state_shared_lock.c | 8 + fs/bcachefs/two_state_shared_lock.h | 59 + - fs/bcachefs/util.c | 1137 +++++ - fs/bcachefs/util.h | 846 ++++ - fs/bcachefs/varint.c | 122 + + fs/bcachefs/util.c | 1144 ++++++ + fs/bcachefs/util.h | 851 +++++ + fs/bcachefs/varint.c | 123 + fs/bcachefs/varint.h | 11 + fs/bcachefs/vstructs.h | 63 + - fs/bcachefs/xattr.c | 648 +++ + fs/bcachefs/xattr.c | 649 ++++ fs/bcachefs/xattr.h | 50 + fs/dcache.c | 12 +- fs/inode.c | 218 +- fs/iomap/buffered-io.c | 45 +- - fs/super.c | 40 +- fs/xfs/xfs_iomap.c | 3 + fs/xfs/xfs_mount.h | 2 + fs/xfs/xfs_super.c | 6 +- - include/asm-generic/codetag.lds.h | 15 + - include/asm-generic/vmlinux.lds.h | 3 + - include/linux/alloc_tag.h | 160 + include/linux/bio.h | 7 +- include/linux/blkdev.h | 1 + .../md/bcache => include/linux}/closure.h | 46 +- - include/linux/codetag.h | 110 + include/linux/dcache.h | 1 + - include/linux/dma-map-ops.h | 2 +- - include/linux/dynamic_fault.h | 79 + include/linux/exportfs.h | 6 + - include/linux/fortify-string.h | 5 +- - include/linux/fs.h | 16 +- + include/linux/fs.h | 15 +- include/linux/generic-radix-tree.h | 68 +- - include/linux/gfp.h | 111 +- - include/linux/gfp_types.h | 101 +- - include/linux/hrtimer.h | 2 +- + include/linux/gfp_types.h | 90 +- include/linux/iomap.h | 1 + include/linux/list_bl.h | 22 + include/linux/lockdep.h | 10 + include/linux/lockdep_types.h | 2 +- include/linux/mean_and_variance.h | 198 + - include/linux/memcontrol.h | 56 +- - include/linux/mempool.h | 73 +- - include/linux/mm.h | 8 + - include/linux/mm_types.h | 4 +- include/linux/nodemask.h | 2 +- include/linux/nodemask_types.h | 9 + - include/linux/page_ext.h | 1 - - include/linux/pagemap.h | 9 +- - include/linux/percpu.h | 19 +- - include/linux/pgalloc_tag.h | 105 + include/linux/prandom.h | 1 - - include/linux/rhashtable-types.h | 9 +- - include/linux/sched.h | 29 +- + include/linux/sched.h | 5 +- include/linux/seq_buf.h | 2 + include/linux/shrinker.h | 9 +- - include/linux/six.h | 388 ++ - include/linux/slab.h | 180 +- - include/linux/slab_def.h | 2 +- - include/linux/slub_def.h | 4 +- - include/linux/string.h | 5 +- include/linux/string_helpers.h | 13 +- - include/linux/time_namespace.h | 2 + - include/linux/vmalloc.h | 60 +- - init/Kconfig | 4 + init/init_task.c | 1 + - kernel/Kconfig.locks | 3 + - kernel/dma/mapping.c | 4 +- - kernel/locking/Makefile | 1 + kernel/locking/lockdep.c | 46 + + kernel/locking/mutex.c | 3 + kernel/locking/osq_lock.c | 2 + - kernel/locking/six.c | 893 ++++ - kernel/module/main.c | 25 +- kernel/stacktrace.c | 2 + lib/Kconfig | 3 + - lib/Kconfig.debug | 54 + - lib/Makefile | 9 +- - lib/alloc_tag.c | 225 + - {drivers/md/bcache => lib}/closure.c | 36 +- - lib/codetag.c | 393 ++ - lib/dynamic_fault.c | 371 ++ + lib/Kconfig.debug | 18 + + lib/Makefile | 2 + + {drivers/md/bcache => lib}/closure.c | 41 +- lib/errname.c | 1 + lib/generic-radix-tree.c | 76 +- lib/iov_iter.c | 43 +- lib/math/Kconfig | 3 + lib/math/Makefile | 2 + lib/math/mean_and_variance.c | 158 + - lib/math/mean_and_variance_test.c | 239 + - lib/rhashtable.c | 42 +- + lib/math/mean_and_variance_test.c | 239 ++ + lib/rhashtable.c | 9 +- lib/seq_buf.c | 10 + - lib/string.c | 19 + lib/string_helpers.c | 26 +- 
lib/test-string_helpers.c | 4 +- - mm/Makefile | 2 +- - mm/compaction.c | 10 +- - mm/filemap.c | 6 +- - mm/huge_memory.c | 2 + mm/hugetlb.c | 8 +- - mm/kfence/core.c | 14 +- - mm/kfence/kfence.h | 4 +- mm/madvise.c | 61 + - mm/memcontrol.c | 56 +- - mm/mempolicy.c | 42 +- - mm/mempool.c | 34 +- - mm/mm_init.c | 1 + mm/oom_kill.c | 23 - - mm/page_alloc.c | 66 +- - mm/page_ext.c | 13 + - mm/page_owner.c | 2 +- - mm/percpu-internal.h | 26 +- - mm/percpu.c | 120 +- - {lib => mm}/show_mem.c | 37 + - mm/slab.c | 24 +- - mm/slab.h | 252 +- - mm/slab_common.c | 148 +- - mm/slub.c | 26 +- - mm/util.c | 44 +- - mm/vmalloc.c | 88 +- + mm/show_mem.c | 22 + + mm/slab.h | 6 +- + mm/slab_common.c | 52 +- mm/vmscan.c | 99 +- scripts/Kbuild.include | 10 + scripts/Makefile.lib | 2 +- scripts/kallsyms.c | 13 + - scripts/module.lds.S | 7 + - 308 files changed, 96733 insertions(+), 930 deletions(-) + 265 files changed, 95211 insertions(+), 312 deletions(-) create mode 100644 fs/bcachefs/Kconfig create mode 100644 fs/bcachefs/Makefile create mode 100644 fs/bcachefs/acl.c @@ -347,15 +304,18 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/btree_io.h create mode 100644 fs/bcachefs/btree_iter.c create mode 100644 fs/bcachefs/btree_iter.h + create mode 100644 fs/bcachefs/btree_journal_iter.c + create mode 100644 fs/bcachefs/btree_journal_iter.h create mode 100644 fs/bcachefs/btree_key_cache.c create mode 100644 fs/bcachefs/btree_key_cache.h create mode 100644 fs/bcachefs/btree_locking.c create mode 100644 fs/bcachefs/btree_locking.h + create mode 100644 fs/bcachefs/btree_trans_commit.c create mode 100644 fs/bcachefs/btree_types.h + create mode 100644 fs/bcachefs/btree_update.c create mode 100644 fs/bcachefs/btree_update.h create mode 100644 fs/bcachefs/btree_update_interior.c create mode 100644 fs/bcachefs/btree_update_interior.h - create mode 100644 fs/bcachefs/btree_update_leaf.c create mode 100644 fs/bcachefs/btree_write_buffer.c create mode 100644 fs/bcachefs/btree_write_buffer.h create mode 100644 fs/bcachefs/btree_write_buffer_types.h @@ -401,6 +361,12 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/fifo.h create mode 100644 fs/bcachefs/fs-common.c create mode 100644 fs/bcachefs/fs-common.h + create mode 100644 fs/bcachefs/fs-io-buffered.c + create mode 100644 fs/bcachefs/fs-io-buffered.h + create mode 100644 fs/bcachefs/fs-io-direct.c + create mode 100644 fs/bcachefs/fs-io-direct.h + create mode 100644 fs/bcachefs/fs-io-pagecache.c + create mode 100644 fs/bcachefs/fs-io-pagecache.h create mode 100644 fs/bcachefs/fs-io.c create mode 100644 fs/bcachefs/fs-io.h create mode 100644 fs/bcachefs/fs-ioctl.c @@ -452,14 +418,23 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/rebalance_types.h create mode 100644 fs/bcachefs/recovery.c create mode 100644 fs/bcachefs/recovery.h + create mode 100644 fs/bcachefs/recovery_types.h create mode 100644 fs/bcachefs/reflink.c create mode 100644 fs/bcachefs/reflink.h create mode 100644 fs/bcachefs/replicas.c create mode 100644 fs/bcachefs/replicas.h create mode 100644 fs/bcachefs/replicas_types.h + create mode 100644 fs/bcachefs/sb-clean.c + create mode 100644 fs/bcachefs/sb-clean.h + create mode 100644 fs/bcachefs/sb-members.c + create mode 100644 fs/bcachefs/sb-members.h create mode 100644 fs/bcachefs/seqmutex.h create mode 100644 fs/bcachefs/siphash.c create mode 100644 fs/bcachefs/siphash.h + create mode 100644 fs/bcachefs/six.c + create mode 100644 fs/bcachefs/six.h + create mode 100644 fs/bcachefs/snapshot.c + create mode 100644 
fs/bcachefs/snapshot.h create mode 100644 fs/bcachefs/str_hash.h create mode 100644 fs/bcachefs/subvolume.c create mode 100644 fs/bcachefs/subvolume.h @@ -484,109 +459,18 @@ Signed-off-by: Piotr Gorski create mode 100644 fs/bcachefs/vstructs.h create mode 100644 fs/bcachefs/xattr.c create mode 100644 fs/bcachefs/xattr.h - create mode 100644 include/asm-generic/codetag.lds.h - create mode 100644 include/linux/alloc_tag.h rename {drivers/md/bcache => include/linux}/closure.h (93%) - create mode 100644 include/linux/codetag.h - create mode 100644 include/linux/dynamic_fault.h create mode 100644 include/linux/mean_and_variance.h create mode 100644 include/linux/nodemask_types.h - create mode 100644 include/linux/pgalloc_tag.h - create mode 100644 include/linux/six.h - create mode 100644 kernel/locking/six.c - create mode 100644 lib/alloc_tag.c - rename {drivers/md/bcache => lib}/closure.c (88%) - create mode 100644 lib/codetag.c - create mode 100644 lib/dynamic_fault.c + rename {drivers/md/bcache => lib}/closure.c (85%) create mode 100644 lib/math/mean_and_variance.c create mode 100644 lib/math/mean_and_variance_test.c - rename {lib => mm}/show_mem.c (57%) -diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst -index 45ba1f4dc..0a012ac13 100644 ---- a/Documentation/admin-guide/sysctl/vm.rst -+++ b/Documentation/admin-guide/sysctl/vm.rst -@@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm: - - legacy_va_layout - - lowmem_reserve_ratio - - max_map_count -+- mem_profiling (only if CONFIG_MEM_ALLOC_PROFILING=y) - - memory_failure_early_kill - - memory_failure_recovery - - min_free_kbytes -@@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation. - The default value is 65530. - - -+mem_profiling -+============== -+ -+Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y) -+ -+1: Enable memory profiling. -+ -+0: Disabld memory profiling. -+ -+Enabling memory profiling introduces a small performance overhead for all -+memory allocations. -+ -+The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. -+ -+ - memory_failure_early_kill: - ========================== - -diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst -index 7897a7daf..810f851e6 100644 ---- a/Documentation/filesystems/proc.rst -+++ b/Documentation/filesystems/proc.rst -@@ -683,6 +683,7 @@ files are there, and which are missing. - ============ =============================================================== - File Content - ============ =============================================================== -+ allocinfo Memory allocations profiling information - apm Advanced power management info - buddyinfo Kernel memory allocator information (see text) (2.5) - bus Directory containing bus specific information -@@ -942,6 +943,33 @@ also be allocatable although a lot of filesystem metadata may have to be - reclaimed to achieve this. - - -+allocinfo -+~~~~~~~ -+ -+Provides information about memory allocations at all locations in the code -+base. Each allocation in the code is identified by its source file, line -+number, module and the function calling the allocation. The number of bytes -+allocated at each location is reported. -+ -+Example output. 
-+ -+:: -+ -+ > cat /proc/allocinfo -+ -+ 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page -+ 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order -+ 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts -+ 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact -+ 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one -+ 1.16MiB fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc -+ 1.00MiB mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare -+ 734KiB fs/xfs/kmem.c:20 module:xfs func:kmem_alloc -+ 640KiB kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func -+ 640KiB drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf -+ ... -+ -+ - meminfo - ~~~~~~~ - diff --git a/MAINTAINERS b/MAINTAINERS -index 35e195946..22c57b3bc 100644 +index 4cc6bf79f..9c7fa5956 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -3522,6 +3522,14 @@ W: http://bcache.evilpiepirate.org +@@ -3458,6 +3458,14 @@ W: http://bcache.evilpiepirate.org C: irc://irc.oftc.net/bcache F: drivers/md/bcache/ @@ -601,7 +485,7 @@ index 35e195946..22c57b3bc 100644 BDISP ST MEDIA DRIVER M: Fabien Dessenne L: linux-media@vger.kernel.org -@@ -5064,6 +5072,14 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core +@@ -5027,6 +5035,14 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core F: Documentation/devicetree/bindings/timer/ F: drivers/clocksource/ @@ -616,21 +500,7 @@ index 35e195946..22c57b3bc 100644 CMPC ACPI DRIVER M: Thadeu Lima de Souza Cascardo M: Daniel Oliveira Nascimento -@@ -5114,6 +5130,13 @@ S: Supported - F: Documentation/process/code-of-conduct-interpretation.rst - F: Documentation/process/code-of-conduct.rst - -+CODE TAGGING -+M: Suren Baghdasaryan -+M: Kent Overstreet -+S: Maintained -+F: include/linux/codetag.h -+F: lib/codetag.c -+ - COMEDI DRIVERS - M: Ian Abbott - M: H Hartley Sweeten -@@ -8662,6 +8685,13 @@ F: Documentation/devicetree/bindings/power/power?domain* +@@ -8673,6 +8689,13 @@ F: Documentation/devicetree/bindings/power/power?domain* F: drivers/base/power/domain*.c F: include/linux/pm_domain.h @@ -644,9 +514,9 @@ index 35e195946..22c57b3bc 100644 GENERIC RESISTIVE TOUCHSCREEN ADC DRIVER M: Eugen Hristev L: linux-input@vger.kernel.org -@@ -12850,6 +12880,15 @@ F: Documentation/devicetree/bindings/net/ieee802154/mcr20a.txt - F: drivers/net/ieee802154/mcr20a.c - F: drivers/net/ieee802154/mcr20a.h +@@ -12925,6 +12948,15 @@ S: Maintained + F: drivers/net/mdio/mdio-regmap.c + F: include/linux/mdio/mdio-regmap.h +MEAN AND VARIANCE LIBRARY +M: Daniel B. 
Hill @@ -660,54 +530,8 @@ index 35e195946..22c57b3bc 100644 MEASUREMENT COMPUTING CIO-DAC IIO DRIVER M: William Breathitt Gray L: linux-iio@vger.kernel.org -@@ -13489,6 +13528,15 @@ F: mm/memblock.c - F: mm/mm_init.c - F: tools/testing/memblock/ - -+MEMORY ALLOCATION PROFILING -+M: Suren Baghdasaryan -+M: Kent Overstreet -+S: Maintained -+F: include/linux/alloc_tag.h -+F: include/linux/codetag_ctx.h -+F: lib/alloc_tag.c -+F: lib/pgalloc_tag.c -+ - MEMORY CONTROLLER DRIVERS - M: Krzysztof Kozlowski - L: linux-kernel@vger.kernel.org -@@ -19376,6 +19424,14 @@ S: Maintained - W: http://www.winischhofer.at/linuxsisusbvga.shtml - F: drivers/usb/misc/sisusbvga/ - -+SIX LOCKS -+M: Kent Overstreet -+L: linux-bcachefs@vger.kernel.org -+S: Supported -+C: irc://irc.oftc.net/bcache -+F: include/linux/six.h -+F: kernel/locking/six.c -+ - SL28 CPLD MFD DRIVER - M: Michael Walle - S: Maintained -diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h -index db7b371b3..31823d971 100644 ---- a/arch/arm64/include/asm/spectre.h -+++ b/arch/arm64/include/asm/spectre.h -@@ -13,8 +13,8 @@ - #define __BP_HARDEN_HYP_VECS_SZ ((BP_HARDEN_EL2_SLOTS - 1) * SZ_2K) - - #ifndef __ASSEMBLY__ -- --#include -+#include -+#include - - #include - #include diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c -index e8db8c8ef..1a3bd656f 100644 +index e7ea492ac..5936205bf 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -261,7 +261,7 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e @@ -719,34 +543,8 @@ index e8db8c8ef..1a3bd656f 100644 pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, exec ? " (exec)" : ""); -diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c -index 56a917df4..842a0ec5e 100644 ---- a/arch/x86/kernel/amd_gart_64.c -+++ b/arch/x86/kernel/amd_gart_64.c -@@ -676,7 +676,7 @@ static const struct dma_map_ops gart_dma_ops = { - .get_sgtable = dma_common_get_sgtable, - .dma_supported = dma_direct_supported, - .get_required_mask = dma_direct_get_required_mask, -- .alloc_pages = dma_direct_alloc_pages, -+ .alloc_pages_op = dma_direct_alloc_pages, - .free_pages = dma_direct_free_pages, - }; - -diff --git a/block/bdev.c b/block/bdev.c -index 21c63bfef..a4d7e8732 100644 ---- a/block/bdev.c -+++ b/block/bdev.c -@@ -934,7 +934,7 @@ EXPORT_SYMBOL(lookup_bdev); - - int __invalidate_device(struct block_device *bdev, bool kill_dirty) - { -- struct super_block *sb = get_super(bdev); -+ struct super_block *sb = try_get_super(bdev); - int res = 0; - - if (sb) { diff --git a/block/bio.c b/block/bio.c -index 043944fd4..70b5c987b 100644 +index 867217921..425b3da39 100644 --- a/block/bio.c +++ b/block/bio.c @@ -606,15 +606,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) @@ -768,7 +566,7 @@ index 043944fd4..70b5c987b 100644 /** * bio_truncate - truncate the bio to small size of @new_size -@@ -1245,7 +1245,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +@@ -1252,7 +1252,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page **pages = (struct page **)bv; ssize_t size, left; unsigned len, i = 0; @@ -777,7 +575,7 @@ index 043944fd4..70b5c987b 100644 int ret = 0; /* -@@ -1274,10 +1274,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +@@ -1281,10 +1281,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct 
iov_iter *iter) nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); @@ -793,7 +591,7 @@ index 043944fd4..70b5c987b 100644 if (unlikely(!size)) { ret = -EFAULT; goto out; -@@ -1481,6 +1483,7 @@ void bio_set_pages_dirty(struct bio *bio) +@@ -1490,6 +1492,7 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(bvec->bv_page); } } @@ -801,7 +599,7 @@ index 043944fd4..70b5c987b 100644 /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. -@@ -1540,6 +1543,7 @@ void bio_check_pages_dirty(struct bio *bio) +@@ -1549,6 +1552,7 @@ void bio_check_pages_dirty(struct bio *bio) spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } @@ -810,10 +608,10 @@ index 043944fd4..70b5c987b 100644 static inline bool bio_remaining_done(struct bio *bio) { diff --git a/block/blk-core.c b/block/blk-core.c -index 3fc68b944..1f23abb7d 100644 +index 9866468c7..9d51e9894 100644 --- a/block/blk-core.c +++ b/block/blk-core.c -@@ -205,6 +205,7 @@ const char *blk_status_to_str(blk_status_t status) +@@ -208,6 +208,7 @@ const char *blk_status_to_str(blk_status_t status) return ""; return blk_errors[idx].name; } @@ -822,7 +620,7 @@ index 3fc68b944..1f23abb7d 100644 /** * blk_sync_queue - cancel any pending callbacks on a queue diff --git a/block/blk.h b/block/blk.h -index 45547bcf1..f20f9ca03 100644 +index 608c5dcc5..47e03fc44 100644 --- a/block/blk.h +++ b/block/blk.h @@ -251,7 +251,6 @@ static inline void bio_integrity_free(struct bio *bio) @@ -833,11 +631,64 @@ index 45547bcf1..f20f9ca03 100644 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); +diff --git a/drivers/accel/ivpu/ivpu_gem.c b/drivers/accel/ivpu/ivpu_gem.c +index 9967fcfa2..4e8122fb6 100644 +--- a/drivers/accel/ivpu/ivpu_gem.c ++++ b/drivers/accel/ivpu/ivpu_gem.c +@@ -61,7 +61,7 @@ static void prime_unmap_pages_locked(struct ivpu_bo *bo) + static const struct ivpu_bo_ops prime_ops = { + .type = IVPU_BO_TYPE_PRIME, + .name = "prime", +- .alloc_pages = prime_alloc_pages_locked, ++ .alloc_pages_op = prime_alloc_pages_locked, + .free_pages = prime_free_pages_locked, + .map_pages = prime_map_pages_locked, + .unmap_pages = prime_unmap_pages_locked, +@@ -134,7 +134,7 @@ static void ivpu_bo_unmap_pages_locked(struct ivpu_bo *bo) + static const struct ivpu_bo_ops shmem_ops = { + .type = IVPU_BO_TYPE_SHMEM, + .name = "shmem", +- .alloc_pages = shmem_alloc_pages_locked, ++ .alloc_pages_op = shmem_alloc_pages_locked, + .free_pages = shmem_free_pages_locked, + .map_pages = ivpu_bo_map_pages_locked, + .unmap_pages = ivpu_bo_unmap_pages_locked, +@@ -186,7 +186,7 @@ static void internal_free_pages_locked(struct ivpu_bo *bo) + static const struct ivpu_bo_ops internal_ops = { + .type = IVPU_BO_TYPE_INTERNAL, + .name = "internal", +- .alloc_pages = internal_alloc_pages_locked, ++ .alloc_pages_op = internal_alloc_pages_locked, + .free_pages = internal_free_pages_locked, + .map_pages = ivpu_bo_map_pages_locked, + .unmap_pages = ivpu_bo_unmap_pages_locked, +@@ -200,7 +200,7 @@ static int __must_check ivpu_bo_alloc_and_map_pages_locked(struct ivpu_bo *bo) + lockdep_assert_held(&bo->lock); + drm_WARN_ON(&vdev->drm, bo->sgt); + +- ret = bo->ops->alloc_pages(bo); ++ ret = bo->ops->alloc_pages_op(bo); + if (ret) { + ivpu_err(vdev, "Failed to allocate pages for BO: %d", ret); + return ret; +diff --git a/drivers/accel/ivpu/ivpu_gem.h b/drivers/accel/ivpu/ivpu_gem.h +index 6b0ceda5f..b81cf2af0 100644 +--- a/drivers/accel/ivpu/ivpu_gem.h ++++ b/drivers/accel/ivpu/ivpu_gem.h +@@ -42,7 
+42,7 @@ enum ivpu_bo_type { + struct ivpu_bo_ops { + enum ivpu_bo_type type; + const char *name; +- int (*alloc_pages)(struct ivpu_bo *bo); ++ int (*alloc_pages_op)(struct ivpu_bo *bo); + void (*free_pages)(struct ivpu_bo *bo); + int (*map_pages)(struct ivpu_bo *bo); + void (*unmap_pages)(struct ivpu_bo *bo); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c -index b47358da9..be10661f1 100644 +index 1fe011676..59140424d 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c -@@ -990,9 +990,9 @@ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) +@@ -986,9 +986,9 @@ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9); string_get_size(nblocks, queue_logical_block_size(q), @@ -862,19 +713,6 @@ index 9d7bf8ee4..6b1748e1f 100644 seq_printf(m, "Max buffer size: %s\n", buf); seq_printf(m, "Number of errors: %u\n", gdrm->stats_num_errors); -diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c -index 7a9f0b0bd..76a9d5ca4 100644 ---- a/drivers/iommu/dma-iommu.c -+++ b/drivers/iommu/dma-iommu.c -@@ -1556,7 +1556,7 @@ static const struct dma_map_ops iommu_dma_ops = { - .flags = DMA_F_PCI_P2PDMA_SUPPORTED, - .alloc = iommu_dma_alloc, - .free = iommu_dma_free, -- .alloc_pages = dma_common_alloc_pages, -+ .alloc_pages_op = dma_common_alloc_pages, - .free_pages = dma_common_free_pages, - .alloc_noncontiguous = iommu_dma_alloc_noncontiguous, - .free_noncontiguous = iommu_dma_free_noncontiguous, diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 529c9d04e..b2d10063d 100644 --- a/drivers/md/bcache/Kconfig @@ -917,7 +755,7 @@ index 5b87e5967..054e8a33a 100644 + journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ util.o writeback.o features.o diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index aebb7ef10..c8b4914ad 100644 +index 5a79bb3c2..7c0d00432 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -179,6 +179,7 @@ @@ -937,10 +775,10 @@ index aebb7ef10..c8b4914ad 100644 struct bucket { atomic_t pin; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 077149c40..d43079d45 100644 +index 0ae2b3676..4affe5875 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c -@@ -2911,7 +2911,6 @@ static int __init bcache_init(void) +@@ -2905,7 +2905,6 @@ static int __init bcache_init(void) goto err; bch_debug_init(); @@ -970,19 +808,19 @@ index 6f3cb7c92..f61ab1bad 100644 #ifdef CONFIG_BCACHE_DEBUG diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c -index e46330815..b5dfaf680 100644 +index b6f4be25b..a09ce965c 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c -@@ -2509,7 +2509,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, +@@ -2510,7 +2510,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, blk_queue_write_cache(md->queue.queue, cache_enabled, fua_enabled); - string_get_size((u64)size, 512, STRING_UNITS_2, + string_get_size((u64)size, 512, STRING_SIZE_BASE2, cap_str, sizeof(cap_str)); - pr_info("%s: %s %s %s %s\n", + pr_info("%s: %s %s %s%s\n", md->disk->disk_name, mmc_card_id(card), mmc_card_name(card), -@@ -2705,7 +2705,7 @@ static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, +@@ -2706,7 +2706,7 @@ static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, list_add(&rpmb->node, &md->rpmbs); @@ -1037,10 +875,10 @@ index 14e0d989c..7d5fbebd3 100644 } diff 
--git a/drivers/scsi/sd.c b/drivers/scsi/sd.c -index 1624d528a..bf0a1907b 100644 +index 3c668cfb1..c9abe8f9a 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c -@@ -2580,10 +2580,10 @@ sd_print_capacity(struct scsi_disk *sdkp, +@@ -2681,10 +2681,10 @@ sd_print_capacity(struct scsi_disk *sdkp, if (!sdkp->first_scan && old_capacity == sdkp->capacity) return; @@ -1055,31 +893,6 @@ index 1624d528a..bf0a1907b 100644 sd_printk(KERN_NOTICE, sdkp, "%llu %d-byte logical blocks: (%s/%s)\n", -diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c -index 9784a77fa..6c7d984f1 100644 ---- a/drivers/xen/grant-dma-ops.c -+++ b/drivers/xen/grant-dma-ops.c -@@ -282,7 +282,7 @@ static int xen_grant_dma_supported(struct device *dev, u64 mask) - static const struct dma_map_ops xen_grant_dma_ops = { - .alloc = xen_grant_dma_alloc, - .free = xen_grant_dma_free, -- .alloc_pages = xen_grant_dma_alloc_pages, -+ .alloc_pages_op = xen_grant_dma_alloc_pages, - .free_pages = xen_grant_dma_free_pages, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, -diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c -index 67aa74d20..5ab261615 100644 ---- a/drivers/xen/swiotlb-xen.c -+++ b/drivers/xen/swiotlb-xen.c -@@ -403,6 +403,6 @@ const struct dma_map_ops xen_swiotlb_dma_ops = { - .dma_supported = xen_swiotlb_dma_supported, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, -- .alloc_pages = dma_common_alloc_pages, -+ .alloc_pages_op = dma_common_alloc_pages, - .free_pages = dma_common_free_pages, - }; diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec7..b05c45f63 100644 --- a/fs/Kconfig @@ -1093,10 +906,10 @@ index 18d034ec7..b05c45f63 100644 endif # BLOCK diff --git a/fs/Makefile b/fs/Makefile -index 5bfdbf0d7..977a05cae 100644 +index e513aaee0..cd357ea45 100644 --- a/fs/Makefile +++ b/fs/Makefile -@@ -129,6 +129,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ +@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ obj-$(CONFIG_F2FS_FS) += f2fs/ @@ -1105,10 +918,10 @@ index 5bfdbf0d7..977a05cae 100644 obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/aio.c b/fs/aio.c -index b0b17bd09..b3e14a9fe 100644 +index 77e33619d..5db996acc 100644 --- a/fs/aio.c +++ b/fs/aio.c -@@ -1109,6 +1109,11 @@ static inline void iocb_destroy(struct aio_kiocb *iocb) +@@ -1106,6 +1106,11 @@ static inline void iocb_destroy(struct aio_kiocb *iocb) kmem_cache_free(kiocb_cachep, iocb); } @@ -1120,7 +933,7 @@ index b0b17bd09..b3e14a9fe 100644 /* aio_complete * Called when the io request on the given iocb is complete. 
*/ -@@ -1117,7 +1122,7 @@ static void aio_complete(struct aio_kiocb *iocb) +@@ -1114,7 +1119,7 @@ static void aio_complete(struct aio_kiocb *iocb) struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; @@ -1129,7 +942,7 @@ index b0b17bd09..b3e14a9fe 100644 unsigned long flags; /* -@@ -1161,6 +1166,10 @@ static void aio_complete(struct aio_kiocb *iocb) +@@ -1156,6 +1161,10 @@ static void aio_complete(struct aio_kiocb *iocb) ctx->completed_events++; if (ctx->completed_events > 1) refill_reqs_available(ctx, head, tail); @@ -1140,7 +953,7 @@ index b0b17bd09..b3e14a9fe 100644 spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); -@@ -1181,8 +1190,18 @@ static void aio_complete(struct aio_kiocb *iocb) +@@ -1176,8 +1185,18 @@ static void aio_complete(struct aio_kiocb *iocb) */ smp_mb(); @@ -1161,20 +974,7 @@ index b0b17bd09..b3e14a9fe 100644 } static inline void iocb_put(struct aio_kiocb *iocb) -@@ -1250,10 +1269,10 @@ static long aio_read_events_ring(struct kioctx *ctx, - avail = min(avail, nr - ret); - avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); - -- ev = kmap(page); -+ ev = kmap_local_page(page); - copy_ret = copy_to_user(event + ret, ev + pos, - sizeof(*ev) * avail); -- kunmap(page); -+ kunmap_local(ev); - - if (unlikely(copy_ret)) { - ret = -EFAULT; -@@ -1298,7 +1317,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, +@@ -1290,7 +1309,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, ktime_t until) { @@ -1185,7 +985,7 @@ index b0b17bd09..b3e14a9fe 100644 /* * Note that aio_read_events() is being called as the conditional - i.e. -@@ -1314,12 +1335,37 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, +@@ -1306,12 +1327,37 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. 
*/ @@ -1231,10 +1031,10 @@ index b0b17bd09..b3e14a9fe 100644 diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig new file mode 100644 -index 000000000..6c698b3b3 +index 000000000..fb5b24f20 --- /dev/null +++ b/fs/bcachefs/Kconfig -@@ -0,0 +1,77 @@ +@@ -0,0 +1,76 @@ + +config BCACHEFS_FS + tristate "bcachefs filesystem support (EXPERIMENTAL)" @@ -1256,7 +1056,6 @@ index 000000000..6c698b3b3 + select CRYPTO_CHACHA20 + select CRYPTO_POLY1305 + select KEYS -+ select SIXLOCKS + select RAID6_PQ + select XOR_BLOCKS + select XXHASH @@ -1314,10 +1113,10 @@ index 000000000..6c698b3b3 + This disables device latency tracking and time stats, only for performance testing diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile new file mode 100644 -index 000000000..a71956048 +index 000000000..c87be5fb7 --- /dev/null +++ b/fs/bcachefs/Makefile -@@ -0,0 +1,74 @@ +@@ -0,0 +1,83 @@ + +obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o + @@ -1333,10 +1132,12 @@ index 000000000..a71956048 + btree_gc.o \ + btree_io.o \ + btree_iter.o \ ++ btree_journal_iter.o \ + btree_key_cache.o \ + btree_locking.o \ ++ btree_trans_commit.o \ ++ btree_update.o \ + btree_update_interior.o \ -+ btree_update_leaf.o \ + btree_write_buffer.o \ + buckets.o \ + buckets_waiting_for_journal.o \ @@ -1358,6 +1159,9 @@ index 000000000..a71956048 + fs-common.o \ + fs-ioctl.o \ + fs-io.o \ ++ fs-io-buffered.o \ ++ fs-io-direct.o \ ++ fs-io-pagecache.o \ + fsck.o \ + inode.o \ + io.o \ @@ -1379,7 +1183,11 @@ index 000000000..a71956048 + recovery.o \ + reflink.o \ + replicas.o \ ++ sb-clean.o \ ++ sb-members.o \ + siphash.o \ ++ six.o \ ++ snapshot.o \ + subvolume.o \ + super.o \ + super-io.o \ @@ -1876,10 +1684,10 @@ index 000000000..bb21d8d69 +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 -index 000000000..8d8481fc1 +index 000000000..540d94c0c --- /dev/null +++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,2209 @@ +@@ -0,0 +1,2157 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -1961,36 +1769,6 @@ index 000000000..8d8481fc1 + return v; +} + -+static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p, -+ unsigned field, u64 v) -+{ -+ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; -+ -+ if (!v) -+ return; -+ -+ a->v.fields |= 1 << field; -+ -+ switch (bytes) { -+ case 1: -+ *((u8 *) *p) = v; -+ break; -+ case 2: -+ *((__le16 *) *p) = cpu_to_le16(v); -+ break; -+ case 4: -+ *((__le32 *) *p) = cpu_to_le32(v); -+ break; -+ case 8: -+ *((__le64 *) *p) = cpu_to_le64(v); -+ break; -+ default: -+ BUG(); -+ } -+ -+ *p += bytes; -+} -+ +static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ @@ -2149,10 +1927,9 @@ index 000000000..8d8481fc1 +} + +int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, struct printbuf *err) +{ + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); -+ int rw = flags & WRITE; + + if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { + prt_printf(err, "bad val size (%u > %lu)", @@ -2166,71 +1943,50 @@ index 000000000..8d8481fc1 + return -BCH_ERR_invalid_bkey; + } + -+ if (rw == WRITE && -+ !(flags & BKEY_INVALID_JOURNAL) && -+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_btree_backpointers) { -+ unsigned i, bp_len = 0; -+ -+ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) -+ bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len; -+ -+ if (bp_len > 
a.v->dirty_sectors) { -+ prt_printf(err, "too many backpointers"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { ++ prt_printf(err, "invalid data type (got %u should be %u)", ++ a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); ++ return -BCH_ERR_invalid_bkey; + } + -+ if (rw == WRITE) { -+ if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { -+ prt_printf(err, "invalid data type (got %u should be %u)", -+ a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); ++ switch (a.v->data_type) { ++ case BCH_DATA_free: ++ case BCH_DATA_need_gc_gens: ++ case BCH_DATA_need_discard: ++ if (a.v->dirty_sectors || ++ a.v->cached_sectors || ++ a.v->stripe) { ++ prt_printf(err, "empty data type free but have data"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ break; ++ case BCH_DATA_sb: ++ case BCH_DATA_journal: ++ case BCH_DATA_btree: ++ case BCH_DATA_user: ++ case BCH_DATA_parity: ++ if (!a.v->dirty_sectors) { ++ prt_printf(err, "data_type %s but dirty_sectors==0", ++ bch2_data_types[a.v->data_type]); ++ return -BCH_ERR_invalid_bkey; ++ } ++ break; ++ case BCH_DATA_cached: ++ if (!a.v->cached_sectors || ++ a.v->dirty_sectors || ++ a.v->stripe) { ++ prt_printf(err, "data type inconsistency"); + return -BCH_ERR_invalid_bkey; + } + -+ switch (a.v->data_type) { -+ case BCH_DATA_free: -+ case BCH_DATA_need_gc_gens: -+ case BCH_DATA_need_discard: -+ if (a.v->dirty_sectors || -+ a.v->cached_sectors || -+ a.v->stripe) { -+ prt_printf(err, "empty data type free but have data"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ break; -+ case BCH_DATA_sb: -+ case BCH_DATA_journal: -+ case BCH_DATA_btree: -+ case BCH_DATA_user: -+ case BCH_DATA_parity: -+ if (!a.v->dirty_sectors) { -+ prt_printf(err, "data_type %s but dirty_sectors==0", -+ bch2_data_types[a.v->data_type]); -+ return -BCH_ERR_invalid_bkey; -+ } -+ break; -+ case BCH_DATA_cached: -+ if (!a.v->cached_sectors || -+ a.v->dirty_sectors || -+ a.v->stripe) { -+ prt_printf(err, "data type inconsistency"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (!a.v->io_time[READ] && -+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { -+ prt_printf(err, "cached bucket with read_time == 0"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ break; -+ case BCH_DATA_stripe: -+ if (!a.v->stripe) { -+ prt_printf(err, "data_type %s but stripe==0", -+ bch2_data_types[a.v->data_type]); -+ return -BCH_ERR_invalid_bkey; -+ } -+ break; ++ if (!a.v->io_time[READ] && ++ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { ++ prt_printf(err, "cached bucket with read_time == 0"); ++ return -BCH_ERR_invalid_bkey; + } ++ break; ++ case BCH_DATA_stripe: ++ break; + } + + return 0; @@ -3216,7 +2972,7 @@ index 000000000..8d8481fc1 + struct btree_iter *iter, + struct bpos end) +{ -+ if (!btree_node_type_is_extents(iter->btree_id)) { ++ if (!btree_id_is_extents(iter->btree_id)) { + return __bch2_check_discard_freespace_key(trans, iter); + } else { + int ret; @@ -4354,10 +4110,10 @@ index 000000000..c0914feb5 +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 -index 000000000..fcb7311b1 +index 000000000..e02749ddc --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,1536 @@ +@@ -0,0 +1,1571 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2012 Google, Inc. 
@@ -5349,7 +5105,6 @@ index 000000000..fcb7311b1 + cl = _cl; + goto retry_blocking; + } -+ + } + + return ret; @@ -5391,6 +5146,16 @@ index 000000000..fcb7311b1 + return ret < 0 ? ret : 0; +} + ++/** ++ * should_drop_bucket - check if this is open_bucket should go away ++ * @ca: if set, we're killing buckets for a particular device ++ * @ec: if true, we're shutting down erasure coding and killing all ec ++ * open_buckets ++ * otherwise, return true ++ * ++ * We're killing open_buckets because we're shutting down a device, erasure ++ * coding, or the entire filesystem - check if this open_bucket matches: ++ */ +static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, + struct bch_dev *ca, bool ec) +{ @@ -5402,8 +5167,12 @@ index 000000000..fcb7311b1 + unsigned i; + + if (!drop && ob->ec) { ++ unsigned nr_blocks; ++ + mutex_lock(&ob->ec->lock); -+ for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) { ++ nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks; ++ ++ for (i = 0; i < nr_blocks; i++) { + if (!ob->ec->blocks[i]) + continue; + @@ -5872,31 +5641,53 @@ index 000000000..fcb7311b1 + NULL +}; + ++static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, ++ struct write_point *wp) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ prt_printf(out, "%lu: ", wp->write_point); ++ prt_human_readable_u64(out, wp->sectors_allocated); ++ ++ prt_printf(out, " last wrote: "); ++ bch2_pr_time_units(out, sched_clock() - wp->last_used); ++ ++ for (i = 0; i < WRITE_POINT_STATE_NR; i++) { ++ prt_printf(out, " %s: ", bch2_write_point_states[i]); ++ bch2_pr_time_units(out, wp->time[i]); ++ } ++ ++ prt_newline(out); ++ ++ printbuf_indent_add(out, 2); ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ bch2_open_bucket_to_text(out, c, ob); ++ printbuf_indent_sub(out, 2); ++} ++ +void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct write_point *wp; -+ unsigned i; + ++ prt_str(out, "Foreground write points\n"); + for (wp = c->write_points; + wp < c->write_points + ARRAY_SIZE(c->write_points); -+ wp++) { -+ prt_printf(out, "%lu: ", wp->write_point); -+ prt_human_readable_u64(out, wp->sectors_allocated); ++ wp++) ++ bch2_write_point_to_text(out, c, wp); + -+ prt_printf(out, " last wrote: "); -+ bch2_pr_time_units(out, sched_clock() - wp->last_used); ++ prt_str(out, "Copygc write point\n"); ++ bch2_write_point_to_text(out, c, &c->copygc_write_point); + -+ for (i = 0; i < WRITE_POINT_STATE_NR; i++) { -+ prt_printf(out, " %s: ", bch2_write_point_states[i]); -+ bch2_pr_time_units(out, wp->time[i]); -+ } ++ prt_str(out, "Rebalance write point\n"); ++ bch2_write_point_to_text(out, c, &c->rebalance_write_point); + -+ prt_newline(out); -+ } ++ prt_str(out, "Btree write point\n"); ++ bch2_write_point_to_text(out, c, &c->btree_write_point); +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h new file mode 100644 -index 000000000..fee195f7e +index 000000000..7aaeec44c --- /dev/null +++ b/fs/bcachefs/alloc_foreground.h @@ -0,0 +1,224 @@ @@ -5907,7 +5698,7 @@ index 000000000..fee195f7e +#include "bcachefs.h" +#include "alloc_types.h" +#include "extents.h" -+#include "super.h" ++#include "sb-members.h" + +#include + @@ -6126,7 +5917,7 @@ index 000000000..fee195f7e +#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h new file mode 100644 -index 000000000..c33a29954 +index 000000000..b91b7a461 --- /dev/null +++ b/fs/bcachefs/alloc_types.h @@ -0,0 +1,126 @@ @@ 
-6164,7 +5955,7 @@ index 000000000..c33a29954 +}; + +#define BCH_WATERMARK_BITS 3 -+#define BCH_WATERMARK_MASK ~(~0 << BCH_WATERMARK_BITS) ++#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS) + +#define OPEN_BUCKETS_COUNT 1024 + @@ -6237,7 +6028,7 @@ index 000000000..c33a29954 + struct dev_stripe_state stripe; + + u64 sectors_allocated; -+ } __attribute__((__aligned__(SMP_CACHE_BYTES))); ++ } __aligned(SMP_CACHE_BYTES); + + struct { + struct work_struct index_update_work; @@ -6248,7 +6039,7 @@ index 000000000..c33a29954 + enum write_point_state state; + u64 last_state_change; + u64 time[WRITE_POINT_STATE_NR]; -+ } __attribute__((__aligned__(SMP_CACHE_BYTES))); ++ } __aligned(SMP_CACHE_BYTES); +}; + +struct write_point_specifier { @@ -7328,10 +7119,10 @@ index 000000000..1fbed1f83 +#endif /* _BCACHEFS_BBPOS_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 -index 000000000..82b0706a8 +index 000000000..30b3d7b9f --- /dev/null +++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,1201 @@ +@@ -0,0 +1,1146 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H @@ -7542,6 +7333,7 @@ index 000000000..82b0706a8 +#include "fifo.h" +#include "nocow_locking_types.h" +#include "opts.h" ++#include "recovery_types.h" +#include "seqmutex.h" +#include "util.h" + @@ -7627,8 +7419,8 @@ index 000000000..82b0706a8 + +#define bch_err_fn(_c, _ret) \ + bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret)) -+#define bch_err_msg(_c, _ret, _msg) \ -+ bch_err(_c, "%s(): error " _msg " %s", __func__, bch2_err_str(_ret)) ++#define bch_err_msg(_c, _ret, _msg, ...) \ ++ bch_err(_c, "%s(): error " _msg " %s", __func__, ##__VA_ARGS__, bch2_err_str(_ret)) + +#define bch_verbose(c, fmt, ...) \ +do { \ @@ -7786,6 +7578,7 @@ index 000000000..82b0706a8 + GC_PHASE_BTREE_backpointers, + GC_PHASE_BTREE_bucket_gens, + GC_PHASE_BTREE_snapshot_trees, ++ GC_PHASE_BTREE_deleted_inodes, + + GC_PHASE_PENDING_DELETE, +}; @@ -7989,48 +7782,6 @@ index 000000000..82b0706a8 + BCH_WRITE_REF_NR, +}; + -+#define PASS_SILENT BIT(0) -+#define PASS_FSCK BIT(1) -+#define PASS_UNCLEAN BIT(2) -+#define PASS_ALWAYS BIT(3) -+ -+#define BCH_RECOVERY_PASSES() \ -+ x(alloc_read, PASS_ALWAYS) \ -+ x(stripes_read, PASS_ALWAYS) \ -+ x(initialize_subvolumes, 0) \ -+ x(snapshots_read, PASS_ALWAYS) \ -+ x(check_topology, 0) \ -+ x(check_allocations, PASS_FSCK) \ -+ x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ -+ x(journal_replay, PASS_ALWAYS) \ -+ x(check_alloc_info, PASS_FSCK) \ -+ x(check_lrus, PASS_FSCK) \ -+ x(check_btree_backpointers, PASS_FSCK) \ -+ x(check_backpointers_to_extents,PASS_FSCK) \ -+ x(check_extents_to_backpointers,PASS_FSCK) \ -+ x(check_alloc_to_lru_refs, PASS_FSCK) \ -+ x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ -+ x(bucket_gens_init, 0) \ -+ x(check_snapshot_trees, PASS_FSCK) \ -+ x(check_snapshots, PASS_FSCK) \ -+ x(check_subvols, PASS_FSCK) \ -+ x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ -+ x(fs_upgrade_for_subvolumes, 0) \ -+ x(check_inodes, PASS_FSCK|PASS_UNCLEAN) \ -+ x(check_extents, PASS_FSCK) \ -+ x(check_dirents, PASS_FSCK) \ -+ x(check_xattrs, PASS_FSCK) \ -+ x(check_root, PASS_FSCK) \ -+ x(check_directory_structure, PASS_FSCK) \ -+ x(check_nlinks, PASS_FSCK) \ -+ x(fix_reflink_p, 0) \ -+ -+enum bch_recovery_pass { -+#define x(n, when) BCH_RECOVERY_PASS_##n, -+ BCH_RECOVERY_PASSES() -+#undef x -+}; -+ +struct bch_fs { + struct closure cl; + @@ -8369,6 +8120,7 @@ index 000000000..82b0706a8 + enum bch_recovery_pass curr_recovery_pass; + /* bitmap of 
explicitly enabled recovery passes: */ + u64 recovery_passes_explicit; ++ u64 recovery_passes_complete; + + /* DEBUG JUNK */ + struct dentry *fs_debug_dir; @@ -8513,32 +8265,16 @@ index 000000000..82b0706a8 + return dev < c->sb.nr_devices && c->devs[dev]; +} + -+/* -+ * For when we need to rewind recovery passes and run a pass we skipped: -+ */ -+static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, -+ enum bch_recovery_pass pass) -+{ -+ c->recovery_passes_explicit |= BIT_ULL(pass); -+ -+ if (c->curr_recovery_pass >= pass) { -+ c->curr_recovery_pass = pass; -+ return -BCH_ERR_restart_recovery; -+ } else { -+ return 0; -+ } -+} -+ +#define BKEY_PADDED_ONSTACK(key, pad) \ + struct { struct bkey_i key; __u64 key ## _pad[pad]; } + +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 -index 000000000..5c308f842 +index 000000000..f17238be4 --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h -@@ -0,0 +1,2319 @@ +@@ -0,0 +1,2368 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FORMAT_H +#define _BCACHEFS_FORMAT_H @@ -9457,9 +9193,7 @@ index 000000000..5c308f842 +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + -+#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(__u64) - \ -+ sizeof(struct bkey) - \ -+ offsetof(struct bch_dirent, d_name))) ++#define BCH_NAME_MAX 512 + +/* Xattrs */ + @@ -9667,6 +9401,11 @@ index 000000000..5c308f842 + __le32 flags; + __le32 snapshot; + __le64 inode; ++ /* ++ * Snapshot subvolumes form a tree, separate from the snapshot nodes ++ * tree - if this subvolume is a snapshot, this is the ID of the ++ * subvolume it was created from: ++ */ + __le32 parent; + __le32 pad; + bch_le128 otime; @@ -9688,6 +9427,7 @@ index 000000000..5c308f842 + __le32 parent; + __le32 children[2]; + __le32 subvol; ++ /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ + __le32 tree; + __le32 depth; + __le32 skip[3]; @@ -10170,7 +9910,9 @@ index 000000000..5c308f842 + x(major_minor, BCH_VERSION(1, 0), \ + 0) \ + x(snapshot_skiplists, BCH_VERSION(1, 1), \ -+ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) ++ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \ ++ x(deleted_inodes, BCH_VERSION(1, 2), \ ++ BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) + +enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, @@ -10679,7 +10421,7 @@ index 000000000..5c308f842 + __le64 _buckets_unavailable; /* No longer used */ + + struct jset_entry_dev_usage_type d[]; -+} __packed; ++}; + +static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) +{ @@ -10735,26 +10477,69 @@ index 000000000..5c308f842 + +/* Btree: */ + -+#define BCH_BTREE_IDS() \ -+ x(extents, 0) \ -+ x(inodes, 1) \ -+ x(dirents, 2) \ -+ x(xattrs, 3) \ -+ x(alloc, 4) \ -+ x(quotas, 5) \ -+ x(stripes, 6) \ -+ x(reflink, 7) \ -+ x(subvolumes, 8) \ -+ x(snapshots, 9) \ -+ x(lru, 10) \ -+ x(freespace, 11) \ -+ x(need_discard, 12) \ -+ x(backpointers, 13) \ -+ x(bucket_gens, 14) \ -+ x(snapshot_trees, 15) ++enum btree_id_flags { ++ BTREE_ID_EXTENTS = BIT(0), ++ BTREE_ID_SNAPSHOTS = BIT(1), ++ BTREE_ID_DATA = BIT(2), ++}; ++ ++#define BCH_BTREE_IDS() \ ++ x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\ ++ BIT_ULL(KEY_TYPE_whiteout)| \ ++ BIT_ULL(KEY_TYPE_error)| \ ++ BIT_ULL(KEY_TYPE_cookie)| \ ++ BIT_ULL(KEY_TYPE_extent)| \ ++ BIT_ULL(KEY_TYPE_reservation)| \ ++ BIT_ULL(KEY_TYPE_reflink_p)| \ ++ BIT_ULL(KEY_TYPE_inline_data)) \ ++ x(inodes, 1, BTREE_ID_SNAPSHOTS, \ ++ BIT_ULL(KEY_TYPE_whiteout)| \ ++ 
BIT_ULL(KEY_TYPE_inode)| \ ++ BIT_ULL(KEY_TYPE_inode_v2)| \ ++ BIT_ULL(KEY_TYPE_inode_v3)| \ ++ BIT_ULL(KEY_TYPE_inode_generation)) \ ++ x(dirents, 2, BTREE_ID_SNAPSHOTS, \ ++ BIT_ULL(KEY_TYPE_whiteout)| \ ++ BIT_ULL(KEY_TYPE_hash_whiteout)| \ ++ BIT_ULL(KEY_TYPE_dirent)) \ ++ x(xattrs, 3, BTREE_ID_SNAPSHOTS, \ ++ BIT_ULL(KEY_TYPE_whiteout)| \ ++ BIT_ULL(KEY_TYPE_cookie)| \ ++ BIT_ULL(KEY_TYPE_hash_whiteout)| \ ++ BIT_ULL(KEY_TYPE_xattr)) \ ++ x(alloc, 4, 0, \ ++ BIT_ULL(KEY_TYPE_alloc)| \ ++ BIT_ULL(KEY_TYPE_alloc_v2)| \ ++ BIT_ULL(KEY_TYPE_alloc_v3)| \ ++ BIT_ULL(KEY_TYPE_alloc_v4)) \ ++ x(quotas, 5, 0, \ ++ BIT_ULL(KEY_TYPE_quota)) \ ++ x(stripes, 6, 0, \ ++ BIT_ULL(KEY_TYPE_stripe)) \ ++ x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \ ++ BIT_ULL(KEY_TYPE_reflink_v)| \ ++ BIT_ULL(KEY_TYPE_indirect_inline_data)) \ ++ x(subvolumes, 8, 0, \ ++ BIT_ULL(KEY_TYPE_subvolume)) \ ++ x(snapshots, 9, 0, \ ++ BIT_ULL(KEY_TYPE_snapshot)) \ ++ x(lru, 10, 0, \ ++ BIT_ULL(KEY_TYPE_set)) \ ++ x(freespace, 11, BTREE_ID_EXTENTS, \ ++ BIT_ULL(KEY_TYPE_set)) \ ++ x(need_discard, 12, 0, \ ++ BIT_ULL(KEY_TYPE_set)) \ ++ x(backpointers, 13, 0, \ ++ BIT_ULL(KEY_TYPE_backpointer)) \ ++ x(bucket_gens, 14, 0, \ ++ BIT_ULL(KEY_TYPE_bucket_gens)) \ ++ x(snapshot_trees, 15, 0, \ ++ BIT_ULL(KEY_TYPE_snapshot_tree)) \ ++ x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \ ++ BIT_ULL(KEY_TYPE_set)) + +enum btree_id { -+#define x(kwd, val) BTREE_ID_##kwd = val, ++#define x(name, nr, ...) BTREE_ID_##name = nr, + BCH_BTREE_IDS() +#undef x + BTREE_ID_NR @@ -11234,10 +11019,10 @@ index 000000000..f05881f7e +#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c new file mode 100644 -index 000000000..ee7ba700e +index 000000000..0a5bfe6e9 --- /dev/null +++ b/fs/bcachefs/bkey.c -@@ -0,0 +1,1063 @@ +@@ -0,0 +1,1107 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -11247,14 +11032,6 @@ index 000000000..ee7ba700e +#include "bset.h" +#include "util.h" + -+#undef EBUG_ON -+ -+#ifdef DEBUG_BKEYS -+#define EBUG_ON(cond) BUG_ON(cond) -+#else -+#define EBUG_ON(cond) -+#endif -+ +const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; + +void bch2_bkey_packed_to_binary_text(struct printbuf *out, @@ -11425,6 +11202,28 @@ index 000000000..ee7ba700e +} + +__always_inline ++static void __set_inc_field(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ ++ if (bits) { ++ if (bits > state->bits) { ++ bits -= state->bits; ++ /* avoid shift by 64 if bits is 64 - bits is never 0 here: */ ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ } ++} ++ ++__always_inline +static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; @@ -11438,20 +11237,7 @@ index 000000000..ee7ba700e + if (fls64(v) > bits) + return false; + -+ if (bits > state->bits) { -+ bits -= state->bits; -+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ -+ state->w |= (v >> 1) >> (bits - 1); -+ -+ *state->p = state->w; -+ state->p = next_word(state->p); -+ state->w = 0; -+ state->bits = 64; -+ } -+ -+ state->bits -= bits; -+ state->w |= v << state->bits; -+ ++ __set_inc_field(state, field, v); + return true; +} + @@ -11620,19 +11406,7 @@ index 000000000..ee7ba700e + ret = false; + } + -+ if (bits > state->bits) { -+ bits -= 
state->bits; -+ state->w |= (v >> 1) >> (bits - 1); -+ -+ *state->p = state->w; -+ state->p = next_word(state->p); -+ state->w = 0; -+ state->bits = 64; -+ } -+ -+ state->bits -= bits; -+ state->w |= v << state->bits; -+ ++ __set_inc_field(state, field, v); + return ret; +} + @@ -11675,6 +11449,24 @@ index 000000000..ee7ba700e + + return false; +} ++ ++static bool bkey_format_has_too_big_fields(const struct bkey_format *f) ++{ ++ for (unsigned i = 0; i < f->nr_fields; i++) { ++ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; ++ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ u64 packed_max = f->bits_per_field[i] ++ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) ++ : 0; ++ u64 field_offset = le64_to_cpu(f->field_offset[i]); ++ ++ if (packed_max + field_offset < packed_max || ++ packed_max + field_offset > unpacked_max) ++ return true; ++ } ++ ++ return false; ++} +#endif + +/* @@ -11755,7 +11547,8 @@ index 000000000..ee7ba700e + + BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); + BUG_ON(bkey_packed_successor(&successor, b, *out) && -+ bkey_cmp_left_packed(b, &successor, &orig) < 0); ++ bkey_cmp_left_packed(b, &successor, &orig) < 0 && ++ !bkey_format_has_too_big_fields(f)); + } +#endif + @@ -11823,8 +11616,10 @@ index 000000000..ee7ba700e + + /* allow for extent merging: */ + if (ret.bits_per_field[BKEY_FIELD_SIZE]) { -+ ret.bits_per_field[BKEY_FIELD_SIZE] += 4; -+ bits += 4; ++ unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]); ++ ++ ret.bits_per_field[BKEY_FIELD_SIZE] += b; ++ bits += b; + } + + ret.key_u64s = DIV_ROUND_UP(bits, 64); @@ -11844,40 +11639,74 @@ index 000000000..ee7ba700e + } + } + -+ EBUG_ON(bch2_bkey_format_validate(&ret)); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ { ++ struct printbuf buf = PRINTBUF; ++ ++ BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); ++ printbuf_exit(&buf); ++ } ++#endif + return ret; +} + -+const char *bch2_bkey_format_validate(struct bkey_format *f) ++int bch2_bkey_format_invalid(struct bch_fs *c, ++ struct bkey_format *f, ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + unsigned i, bits = KEY_PACKED_BITS_START; + -+ if (f->nr_fields != BKEY_NR_FIELDS) -+ return "incorrect number of fields"; ++ if (f->nr_fields != BKEY_NR_FIELDS) { ++ prt_printf(err, "incorrect number of fields: got %u, should be %u", ++ f->nr_fields, BKEY_NR_FIELDS); ++ return -BCH_ERR_invalid; ++ } + + /* + * Verify that the packed format can't represent fields larger than the + * unpacked format: + */ + for (i = 0; i < f->nr_fields; i++) { -+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; -+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); -+ u64 packed_max = f->bits_per_field[i] -+ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) -+ : 0; -+ u64 field_offset = le64_to_cpu(f->field_offset[i]); ++ if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) { ++ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; ++ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); ++ u64 packed_max = f->bits_per_field[i] ++ ? 
~((~0ULL << 1) << (f->bits_per_field[i] - 1)) ++ : 0; ++ u64 field_offset = le64_to_cpu(f->field_offset[i]); + -+ if (packed_max + field_offset < packed_max || -+ packed_max + field_offset > unpacked_max) -+ return "field too large"; ++ if (packed_max + field_offset < packed_max || ++ packed_max + field_offset > unpacked_max) { ++ prt_printf(err, "field %u too large: %llu + %llu > %llu", ++ i, packed_max, field_offset, unpacked_max); ++ return -BCH_ERR_invalid; ++ } ++ } + + bits += f->bits_per_field[i]; + } + -+ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) -+ return "incorrect key_u64s"; ++ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) { ++ prt_printf(err, "incorrect key_u64s: got %u, should be %u", ++ f->key_u64s, DIV_ROUND_UP(bits, 64)); ++ return -BCH_ERR_invalid; ++ } + -+ return NULL; ++ return 0; ++} ++ ++void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f) ++{ ++ prt_printf(out, "u64s %u fields ", f->key_u64s); ++ ++ for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) { ++ if (i) ++ prt_str(out, ", "); ++ prt_printf(out, "%u:%llu", ++ f->bits_per_field[i], ++ le64_to_cpu(f->field_offset[i])); ++ } +} + +/* @@ -12303,10 +12132,10 @@ index 000000000..ee7ba700e +#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h new file mode 100644 -index 000000000..e81fb3e00 +index 000000000..51969a462 --- /dev/null +++ b/fs/bcachefs/bkey.h -@@ -0,0 +1,774 @@ +@@ -0,0 +1,782 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_H +#define _BCACHEFS_BKEY_H @@ -12318,6 +12147,12 @@ index 000000000..e81fb3e00 +#include "util.h" +#include "vstructs.h" + ++enum bkey_invalid_flags { ++ BKEY_INVALID_WRITE = (1U << 0), ++ BKEY_INVALID_COMMIT = (1U << 1), ++ BKEY_INVALID_JOURNAL = (1U << 2), ++}; ++ +#if 0 + +/* @@ -13078,7 +12913,9 @@ index 000000000..e81fb3e00 + +void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); +struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); -+const char *bch2_bkey_format_validate(struct bkey_format *); ++int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, ++ enum bkey_invalid_flags, struct printbuf *); ++void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); + +#endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h @@ -13285,10 +13122,10 @@ index 000000000..5f42a6e69 +#endif /* _BCACHEFS_BKEY_CMP_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 -index 000000000..1381166bf +index 000000000..6547142db --- /dev/null +++ b/fs/bcachefs/bkey_methods.c -@@ -0,0 +1,519 @@ +@@ -0,0 +1,456 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -13304,6 +13141,7 @@ index 000000000..1381166bf +#include "lru.h" +#include "quota.h" +#include "reflink.h" ++#include "snapshot.h" +#include "subvolume.h" +#include "xattr.h" + @@ -13431,78 +13269,14 @@ index 000000000..1381166bf + return ops->key_invalid(c, k, flags, err); +} + -+static unsigned bch2_key_types_allowed[] = { -+ [BKEY_TYPE_extents] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_whiteout)| -+ (1U << KEY_TYPE_error)| -+ (1U << KEY_TYPE_cookie)| -+ (1U << KEY_TYPE_extent)| -+ (1U << KEY_TYPE_reservation)| -+ (1U << KEY_TYPE_reflink_p)| -+ (1U << KEY_TYPE_inline_data), -+ [BKEY_TYPE_inodes] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_whiteout)| -+ (1U << KEY_TYPE_inode)| -+ (1U << KEY_TYPE_inode_v2)| -+ (1U << KEY_TYPE_inode_v3)| -+ (1U << KEY_TYPE_inode_generation), -+ [BKEY_TYPE_dirents] = -+ (1U << 
KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_whiteout)| -+ (1U << KEY_TYPE_hash_whiteout)| -+ (1U << KEY_TYPE_dirent), -+ [BKEY_TYPE_xattrs] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_whiteout)| -+ (1U << KEY_TYPE_cookie)| -+ (1U << KEY_TYPE_hash_whiteout)| -+ (1U << KEY_TYPE_xattr), -+ [BKEY_TYPE_alloc] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_alloc)| -+ (1U << KEY_TYPE_alloc_v2)| -+ (1U << KEY_TYPE_alloc_v3)| -+ (1U << KEY_TYPE_alloc_v4), -+ [BKEY_TYPE_quotas] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_quota), -+ [BKEY_TYPE_stripes] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_stripe), -+ [BKEY_TYPE_reflink] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_reflink_v)| -+ (1U << KEY_TYPE_indirect_inline_data), -+ [BKEY_TYPE_subvolumes] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_subvolume), -+ [BKEY_TYPE_snapshots] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_snapshot), -+ [BKEY_TYPE_lru] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_set), -+ [BKEY_TYPE_freespace] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_set), -+ [BKEY_TYPE_need_discard] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_set), -+ [BKEY_TYPE_backpointers] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_backpointer), -+ [BKEY_TYPE_bucket_gens] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_bucket_gens), -+ [BKEY_TYPE_snapshot_trees] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_snapshot_tree), ++static u64 bch2_key_types_allowed[] = { ++#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, ++ BCH_BTREE_IDS() ++#undef x + [BKEY_TYPE_btree] = -+ (1U << KEY_TYPE_deleted)| -+ (1U << KEY_TYPE_btree_ptr)| -+ (1U << KEY_TYPE_btree_ptr_v2), ++ BIT_ULL(KEY_TYPE_deleted)| ++ BIT_ULL(KEY_TYPE_btree_ptr)| ++ BIT_ULL(KEY_TYPE_btree_ptr_v2), +}; + +int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -13516,7 +13290,7 @@ index 000000000..1381166bf + } + + if (flags & BKEY_INVALID_COMMIT && -+ !(bch2_key_types_allowed[type] & (1U << k.k->type))) { ++ !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type))) { + prt_printf(err, "invalid key type for btree %s (%s)", + bch2_btree_ids[type], bch2_bkey_types[k.k->type]); + return -BCH_ERR_invalid_bkey; @@ -13810,10 +13584,10 @@ index 000000000..1381166bf +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h new file mode 100644 -index 000000000..f4e60d2e6 +index 000000000..668f595e2 --- /dev/null +++ b/fs/bcachefs/bkey_methods.h -@@ -0,0 +1,193 @@ +@@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_METHODS_H +#define _BCACHEFS_BKEY_METHODS_H @@ -13829,12 +13603,6 @@ index 000000000..f4e60d2e6 +extern const char * const bch2_bkey_types[]; +extern const struct bkey_ops bch2_bkey_null_ops; + -+enum bkey_invalid_flags { -+ BKEY_INVALID_WRITE = (1U << 0), -+ BKEY_INVALID_COMMIT = (1U << 1), -+ BKEY_INVALID_JOURNAL = (1U << 2), -+}; -+ +/* + * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If + * invalid, entire key will be deleted. 
@@ -13871,11 +13639,12 @@ index 000000000..f4e60d2e6 + : &bch2_bkey_null_ops; +} + -+int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -+int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, -+ enum btree_node_type, unsigned, struct printbuf *); -+int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, -+ enum btree_node_type, unsigned, struct printbuf *); ++int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); ++int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, ++ enum bkey_invalid_flags, struct printbuf *); ++int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, ++ enum bkey_invalid_flags, struct printbuf *); +int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); + +void bch2_bpos_to_text(struct printbuf *, struct bpos); @@ -16406,10 +16175,10 @@ index 000000000..632c2b8c5 +#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file mode 100644 -index 000000000..13c88d953 +index 000000000..a8283fdc7 --- /dev/null +++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1277 @@ +@@ -0,0 +1,1274 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -17626,7 +17395,6 @@ index 000000000..13c88d953 +void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + const struct btree *b) +{ -+ const struct bkey_format *f = &b->format; + struct bset_stats stats; + + memset(&stats, 0, sizeof(stats)); @@ -17640,9 +17408,13 @@ index 000000000..13c88d953 + prt_printf(out, ":\n" + " ptrs: "); + bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ prt_newline(out); + -+ prt_printf(out, "\n" -+ " format: u64s %u fields %u %u %u %u %u\n" ++ prt_printf(out, ++ " format: "); ++ bch2_bkey_format_to_text(out, &b->format); ++ ++ prt_printf(out, + " unpack fn len: %u\n" + " bytes used %zu/%zu (%zu%% full)\n" + " sib u64s: %u, %u (merge threshold %u)\n" @@ -17650,12 +17422,6 @@ index 000000000..13c88d953 + " nr unpacked keys %u\n" + " floats %zu\n" + " failed unpacked %zu\n", -+ f->key_u64s, -+ f->bits_per_field[0], -+ f->bits_per_field[1], -+ f->bits_per_field[2], -+ f->bits_per_field[3], -+ f->bits_per_field[4], + b->unpack_fn_len, + b->nr.live_u64s * sizeof(u64), + btree_bytes(c) - sizeof(struct btree_node), @@ -17825,10 +17591,10 @@ index 000000000..00c9b9218 +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c new file mode 100644 -index 000000000..49e9822dd +index 000000000..83dcd9eb2 --- /dev/null +++ b/fs/bcachefs/btree_gc.c -@@ -0,0 +1,2126 @@ +@@ -0,0 +1,2127 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -17840,6 +17606,7 @@ index 000000000..49e9822dd +#include "alloc_foreground.h" +#include "bkey_methods.h" +#include "bkey_buf.h" ++#include "btree_journal_iter.h" +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update_interior.h" @@ -17874,7 +17641,7 @@ index 000000000..49e9822dd +static bool should_restart_for_topology_repair(struct bch_fs *c) +{ + return c->opts.fix_errors != FSCK_FIX_no && -+ !(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); ++ !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); +} + +static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) @@ -18366,7 +18133,7 @@ index 000000000..49e9822dd + + bch2_trans_init(&trans, c, 0, 0); + -+ for (i = 0; i < btree_id_nr_alive(c)&& !ret; 
i++) { ++ for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (!r->alive) @@ -19957,14 +19724,15 @@ index 000000000..49e9822dd +} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h new file mode 100644 -index 000000000..402c69184 +index 000000000..607575f83 --- /dev/null +++ b/fs/bcachefs/btree_gc.h -@@ -0,0 +1,113 @@ +@@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_GC_H +#define _BCACHEFS_BTREE_GC_H + ++#include "bkey.h" +#include "btree_types.h" + +int bch2_check_topology(struct bch_fs *); @@ -20014,7 +19782,7 @@ index 000000000..402c69184 +static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) +{ + switch (id) { -+#define x(name, v) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; ++#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; + BCH_BTREE_IDS() +#undef x + default: @@ -20076,10 +19844,10 @@ index 000000000..402c69184 +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 -index 000000000..c049876ee +index 000000000..cba3c081b --- /dev/null +++ b/fs/bcachefs/btree_io.c -@@ -0,0 +1,2267 @@ +@@ -0,0 +1,2245 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -20099,6 +19867,7 @@ index 000000000..c049876ee +#include "io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" ++#include "recovery.h" +#include "super-io.h" +#include "trace.h" + @@ -20187,8 +19956,8 @@ index 000000000..c049876ee + vpfree(p, size); +} + -+static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size, -+ bool *used_mempool) ++static void *btree_bounce_alloc(struct bch_fs *c, size_t size, ++ bool *used_mempool) +{ + unsigned flags = memalloc_nofs_save(); + void *p; @@ -20196,7 +19965,7 @@ index 000000000..c049876ee + BUG_ON(size > btree_bytes(c)); + + *used_mempool = false; -+ p = vpmalloc_noprof(size, __GFP_NOWARN|GFP_NOWAIT); ++ p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); + if (!p) { + *used_mempool = true; + p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); @@ -20204,8 +19973,6 @@ index 000000000..c049876ee + memalloc_nofs_restore(flags); + return p; +} -+#define btree_bounce_alloc(_c, _size, _used_mempool) \ -+ alloc_hooks(btree_bounce_alloc_noprof(_c, _size, _used_mempool)) + +static void sort_bkey_ptrs(const struct btree *bt, + struct bkey_packed **ptrs, unsigned nr) @@ -20625,31 +20392,7 @@ index 000000000..c049876ee + prt_str(out, ": "); +} + -+enum btree_err_type { -+ /* -+ * We can repair this locally, and we're after the checksum check so -+ * there's no need to try another replica: -+ */ -+ BTREE_ERR_FIXABLE, -+ /* -+ * We can repair this if we have to, but we should try reading another -+ * replica if we can: -+ */ -+ BTREE_ERR_WANT_RETRY, -+ /* -+ * Read another replica if we have one, otherwise consider the whole -+ * node bad: -+ */ -+ BTREE_ERR_MUST_RETRY, -+ BTREE_ERR_BAD_NODE, -+ BTREE_ERR_INCOMPATIBLE, -+}; -+ -+enum btree_validate_ret { -+ BTREE_RETRY_READ = 64, -+}; -+ -+static int __btree_err(enum btree_err_type type, ++static int __btree_err(int ret, + struct bch_fs *c, + struct bch_dev *ca, + struct btree *b, @@ -20660,7 +20403,6 @@ index 000000000..c049876ee +{ + struct printbuf out = PRINTBUF; + va_list args; -+ int ret = -BCH_ERR_fsck_fix; + + btree_err_msg(&out, c, ca, b, i, b->written, write); + @@ -20676,27 +20418,26 @@ index 000000000..c049876ee + goto out; + } + -+ if (!have_retry && type == BTREE_ERR_WANT_RETRY) -+ type = 
BTREE_ERR_FIXABLE; -+ if (!have_retry && type == BTREE_ERR_MUST_RETRY) -+ type = BTREE_ERR_BAD_NODE; ++ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) ++ ret = -BCH_ERR_btree_node_read_err_fixable; ++ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) ++ ret = -BCH_ERR_btree_node_read_err_bad_node; + -+ switch (type) { -+ case BTREE_ERR_FIXABLE: ++ switch (ret) { ++ case -BCH_ERR_btree_node_read_err_fixable: + mustfix_fsck_err(c, "%s", out.buf); + ret = -BCH_ERR_fsck_fix; + break; -+ case BTREE_ERR_WANT_RETRY: -+ case BTREE_ERR_MUST_RETRY: ++ case -BCH_ERR_btree_node_read_err_want_retry: ++ case -BCH_ERR_btree_node_read_err_must_retry: + bch2_print_string_as_lines(KERN_ERR, out.buf); -+ ret = BTREE_RETRY_READ; + break; -+ case BTREE_ERR_BAD_NODE: ++ case -BCH_ERR_btree_node_read_err_bad_node: + bch2_print_string_as_lines(KERN_ERR, out.buf); + bch2_topology_error(c); + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; + break; -+ case BTREE_ERR_INCOMPATIBLE: ++ case -BCH_ERR_btree_node_read_err_incompatible: + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = -BCH_ERR_fsck_errors_not_fixed; + break; @@ -20713,8 +20454,11 @@ index 000000000..c049876ee +({ \ + int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\ + \ -+ if (_ret != -BCH_ERR_fsck_fix) \ ++ if (_ret != -BCH_ERR_fsck_fix) { \ ++ ret = _ret; \ + goto fsck_err; \ ++ } \ ++ \ + *saw_error = true; \ +}) + @@ -20778,19 +20522,18 @@ index 000000000..c049876ee + int write, bool have_retry, bool *saw_error) +{ + unsigned version = le16_to_cpu(i->version); -+ const char *err; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + int ret = 0; + + btree_err_on(!bch2_version_compatible(version), -+ BTREE_ERR_INCOMPATIBLE, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, + "unsupported bset version %u.%u", + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version)); + + if (btree_err_on(version < c->sb.version_min, -+ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, + "bset version %u older than superblock version_min %u", + version, c->sb.version_min)) { + mutex_lock(&c->sb_lock); @@ -20801,7 +20544,7 @@ index 000000000..c049876ee + + if (btree_err_on(BCH_VERSION_MAJOR(version) > + BCH_VERSION_MAJOR(c->sb.version), -+ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, + "bset version %u newer than superblock version %u", + version, c->sb.version)) { + mutex_lock(&c->sb_lock); @@ -20811,11 +20554,11 @@ index 000000000..c049876ee + } + + btree_err_on(BSET_SEPARATE_WHITEOUTS(i), -+ BTREE_ERR_INCOMPATIBLE, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, + "BSET_SEPARATE_WHITEOUTS no longer supported"); + + if (btree_err_on(offset + sectors > btree_sectors(c), -+ BTREE_ERR_FIXABLE, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, + "bset past end of btree node")) { + i->u64s = 0; + ret = 0; @@ -20823,12 +20566,12 @@ index 000000000..c049876ee + } + + btree_err_on(offset && !i->u64s, -+ BTREE_ERR_FIXABLE, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, + "empty bset"); + + btree_err_on(BSET_OFFSET(i) && + BSET_OFFSET(i) != offset, -+ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, + "bset at wrong sector offset"); + + if (!offset) { @@ -20842,16 +20585,16 @@ index 000000000..c049876ee + + /* XXX endianness */ + 
btree_err_on(bp->seq != bn->keys.seq, -+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + "incorrect sequence number (wrong btree node)"); + } + + btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, -+ BTREE_ERR_MUST_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, + "incorrect btree id"); + + btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, -+ BTREE_ERR_MUST_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, + "incorrect level"); + + if (!write) @@ -20868,7 +20611,7 @@ index 000000000..c049876ee + } + + btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), -+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + "incorrect min_key: got %s should be %s", + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), @@ -20877,7 +20620,7 @@ index 000000000..c049876ee + } + + btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), -+ BTREE_ERR_MUST_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, + "incorrect max key %s", + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); @@ -20886,10 +20629,12 @@ index 000000000..c049876ee + compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + -+ err = bch2_bkey_format_validate(&bn->format); -+ btree_err_on(err, -+ BTREE_ERR_BAD_NODE, c, ca, b, i, -+ "invalid bkey format: %s", err); ++ btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), ++ -BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i, ++ "invalid bkey format: %s\n %s", buf1.buf, ++ (printbuf_reset(&buf2), ++ bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); ++ printbuf_reset(&buf1); + + compat_bformat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, @@ -20929,14 +20674,14 @@ index 000000000..c049876ee + struct bkey tmp; + + if (btree_err_on(bkey_p_next(k) > vstruct_last(i), -+ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, + "key extends past end of bset")) { + i->u64s = cpu_to_le16((u64 *) k - i->_data); + break; + } + + if (btree_err_on(k->format > KEY_FORMAT_CURRENT, -+ BTREE_ERR_FIXABLE, c, NULL, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, + "invalid bkey format %u", k->format)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), @@ -20960,7 +20705,7 @@ index 000000000..c049876ee + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); + -+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); ++ btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), @@ -20984,7 +20729,7 @@ index 000000000..c049876ee + + bch2_dump_bset(c, b, i, 0); + -+ if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { ++ if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); @@ -21027,16 +20772,16 @@ index 000000000..c049876ee + iter->size = (btree_blocks(c) + 1) * 2; + + if (bch2_meta_read_fault("btree")) -+ btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + "dynamic fault"); + + btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), -+ 
BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + "bad magic: want %llx, got %llx", + bset_magic(c), le64_to_cpu(b->data->magic)); + + btree_err_on(!b->data->keys.seq, -+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + "bad btree header: seq 0"); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { @@ -21044,7 +20789,7 @@ index 000000000..c049876ee + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + btree_err_on(b->data->keys.seq != bp->seq, -+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, + "got wrong btree node (seq %llx want %llx)", + b->data->keys.seq, bp->seq); + } @@ -21059,7 +20804,7 @@ index 000000000..c049876ee + i = &b->data->keys; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, + "unknown checksum type %llu", + BSET_CSUM_TYPE(i)); + @@ -21067,7 +20812,7 @@ index 000000000..c049876ee + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + + btree_err_on(bch2_crc_cmp(csum, b->data->csum), -+ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, + "invalid checksum"); + + ret = bset_encrypt(c, i, b->written << 9); @@ -21077,7 +20822,7 @@ index 000000000..c049876ee + + btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), -+ BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL, ++ -BCH_ERR_btree_node_read_err_incompatible, c, NULL, b, NULL, + "btree node does not have NEW_EXTENT_OVERWRITE set"); + + sectors = vstruct_sectors(b->data, c->block_bits); @@ -21089,7 +20834,7 @@ index 000000000..c049876ee + break; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, + "unknown checksum type %llu", + BSET_CSUM_TYPE(i)); + @@ -21097,7 +20842,7 @@ index 000000000..c049876ee + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + + btree_err_on(bch2_crc_cmp(csum, bne->csum), -+ BTREE_ERR_WANT_RETRY, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, + "invalid checksum"); + + ret = bset_encrypt(c, i, b->written << 9); @@ -21130,12 +20875,12 @@ index 000000000..c049876ee + true); + + btree_err_on(blacklisted && first, -+ BTREE_ERR_FIXABLE, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, + "first btree node bset has blacklisted journal seq (%llu)", + le64_to_cpu(i->journal_seq)); + + btree_err_on(blacklisted && ptr_written, -+ BTREE_ERR_FIXABLE, c, ca, b, i, ++ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, + "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", + le64_to_cpu(i->journal_seq), + b->written, b->written + sectors, ptr_written); @@ -21154,7 +20899,7 @@ index 000000000..c049876ee + + if (ptr_written) { + btree_err_on(b->written < ptr_written, -+ BTREE_ERR_WANT_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, + "btree node data missing: expected %u sectors, found %u", + ptr_written, b->written); + } else { @@ -21165,7 +20910,7 @@ index 000000000..c049876ee + !bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), + true), -+ BTREE_ERR_WANT_RETRY, c, ca, b, NULL, ++ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, + "found bset signature after last bset"); + + /* @@ -21219,7 +20964,7 @@ index 000000000..c049876ee + 
prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); + -+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); ++ btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); + + btree_keys_account_key_drop(&b->nr, 0, k); + @@ -21259,7 +21004,8 @@ index 000000000..c049876ee + printbuf_exit(&buf); + return retry_read; +fsck_err: -+ if (ret == BTREE_RETRY_READ) ++ if (ret == -BCH_ERR_btree_node_read_err_want_retry || ++ ret == -BCH_ERR_btree_node_read_err_must_retry) + retry_read = 1; + else + set_btree_node_read_error(b); @@ -21445,14 +21191,14 @@ index 000000000..c049876ee + } + + written2 = btree_node_sectors_written(c, ra->buf[i]); -+ if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, + "btree node sectors written mismatch: %u != %u", + written, written2) || + btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), -+ BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, + "found bset signature after last bset") || + btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), -+ BTREE_ERR_FIXABLE, c, NULL, b, NULL, ++ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, + "btree node replicas content mismatch")) + dump_bset_maps = true; + @@ -22349,7 +22095,7 @@ index 000000000..c049876ee +} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h new file mode 100644 -index 000000000..0cadf651e +index 000000000..cd99bbb00 --- /dev/null +++ b/fs/bcachefs/btree_io.h @@ -0,0 +1,228 @@ @@ -22498,8 +22244,8 @@ index 000000000..0cadf651e + __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, + __BTREE_WRITE_ALREADY_STARTED, +}; -+#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED ) -+#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED) ++#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED) ++#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED) + +void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); +void bch2_btree_node_write(struct bch_fs *, struct btree *, @@ -22583,10 +22329,10 @@ index 000000000..0cadf651e +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 -index 000000000..e292c5a2a +index 000000000..21c2bc8a8 --- /dev/null +++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,3214 @@ +@@ -0,0 +1,3194 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -22594,6 +22340,7 @@ index 000000000..e292c5a2a +#include "bkey_buf.h" +#include "btree_cache.h" +#include "btree_iter.h" ++#include "btree_journal_iter.h" +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update.h" @@ -22601,9 +22348,8 @@ index 000000000..e292c5a2a +#include "error.h" +#include "extents.h" +#include "journal.h" -+#include "recovery.h" +#include "replicas.h" -+#include "subvolume.h" ++#include "snapshot.h" +#include "trace.h" + +#include @@ -22624,18 +22370,6 @@ index 000000000..e292c5a2a + +static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); + -+/* -+ * Unlocks before scheduling -+ * Note: does not revalidate iterator -+ */ -+static inline int bch2_trans_cond_resched(struct btree_trans *trans) -+{ -+ if (need_resched() || race_fault()) -+ return drop_locks_do(trans, (schedule(), 0)); -+ else -+ return 0; -+} -+ +static inline int __btree_path_cmp(const struct btree_path *l, + enum btree_id 
r_btree_id, + bool r_cached, @@ -23609,7 +23343,7 @@ index 000000000..e292c5a2a + /* + * We used to assert that all paths had been traversed here + * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since -+ * path->Should_be_locked is not set yet, we we might have unlocked and ++ * path->should_be_locked is not set yet, we might have unlocked and + * then failed to relock a path - that's fine. + */ +err: @@ -23942,14 +23676,14 @@ index 000000000..e292c5a2a + __bch2_path_free(trans, path); +} + -+void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) ++void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) +{ + panic("trans->restart_count %u, should be %u, last restarted by %pS\n", + trans->restart_count, restart_count, + (void *) trans->last_begin_ip); +} + -+void bch2_trans_in_restart_error(struct btree_trans *trans) ++void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) +{ + panic("in transaction restart: %s, last restarted by %pS\n", + bch2_err_str(trans->restarted), @@ -25321,19 +25055,9 @@ index 000000000..e292c5a2a + iter->key_cache_path = NULL; +} + -+static inline void bch2_trans_iter_init_inlined(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags) -+{ -+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, -+ bch2_btree_iter_flags(trans, btree_id, flags), -+ _RET_IP_); -+} -+ +void bch2_trans_iter_init_outlined(struct btree_trans *trans, + struct btree_iter *iter, -+ unsigned btree_id, struct bpos pos, ++ enum btree_id btree_id, struct bpos pos, + unsigned flags) +{ + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, @@ -25349,9 +25073,9 @@ index 000000000..e292c5a2a + unsigned depth, + unsigned flags) +{ -+ flags |= BTREE_ITER_NOT_EXTENTS; -+ flags |= __BTREE_ITER_ALL_SNAPSHOTS; -+ flags |= BTREE_ITER_ALL_SNAPSHOTS; ++ flags |= BTREE_ITER_NOT_EXTENTS; ++ flags |= __BTREE_ITER_ALL_SNAPSHOTS; ++ flags |= BTREE_ITER_ALL_SNAPSHOTS; + + bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, + __bch2_btree_iter_flags(trans, btree_id, flags), @@ -25509,12 +25233,14 @@ index 000000000..e292c5a2a +#ifdef __KERNEL__ + p = this_cpu_xchg(c->btree_paths_bufs->path, NULL); +#endif -+ if (!p) ++ if (!p) { + p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); -+ /* -+ * paths need to be zeroed, bch2_check_for_deadlock looks at paths in -+ * other threads -+ */ ++ /* ++ * paths need to be zeroed, bch2_check_for_deadlock looks at ++ * paths in other threads ++ */ ++ memset(p, 0, paths_bytes); ++ } + + trans->paths = p; p += paths_bytes; + trans->updates = p; p += updates_bytes; @@ -25803,10 +25529,10 @@ index 000000000..e292c5a2a +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 -index 000000000..c472aa8c5 +index 000000000..4469b2e16 --- /dev/null +++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,924 @@ +@@ -0,0 +1,940 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_ITER_H +#define _BCACHEFS_BTREE_ITER_H @@ -26030,6 +25756,22 @@ index 000000000..c472aa8c5 + unsigned, unsigned, unsigned, unsigned long); +struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); + ++/* ++ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a ++ * different snapshot: ++ */ ++static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) ++{ ++ struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); ++ 
++ if (k.k && bpos_eq(path->pos, k.k->p)) ++ return k; ++ ++ bkey_init(u); ++ u->p = path->pos; ++ return (struct bkey_s_c) { u, NULL }; ++} ++ +struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, + struct btree_iter *, struct bpos); + @@ -26074,7 +25816,7 @@ index 000000000..c472aa8c5 + return restart_count != trans->restart_count; +} + -+void bch2_trans_restart_error(struct btree_trans *, u32); ++void __noreturn bch2_trans_restart_error(struct btree_trans *, u32); + +static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, + u32 restart_count) @@ -26083,7 +25825,7 @@ index 000000000..c472aa8c5 + bch2_trans_restart_error(trans, restart_count); +} + -+void bch2_trans_in_restart_error(struct btree_trans *); ++void __noreturn bch2_trans_in_restart_error(struct btree_trans *); + +static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) +{ @@ -26256,7 +25998,7 @@ index 000000000..c472aa8c5 +} + +void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, -+ unsigned, struct bpos, unsigned); ++ enum btree_id, struct bpos, unsigned); + +static inline void bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, @@ -26731,6 +26473,606 @@ index 000000000..c472aa8c5 +int bch2_fs_btree_iter_init(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_ITER_H */ +diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c +new file mode 100644 +index 000000000..58a981bcf +--- /dev/null ++++ b/fs/bcachefs/btree_journal_iter.c +@@ -0,0 +1,531 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bset.h" ++#include "btree_journal_iter.h" ++#include "journal_io.h" ++ ++#include ++ ++/* ++ * For managing keys we read from the journal: until journal replay works normal ++ * btree lookups need to be able to find and return keys from the journal where ++ * they overwrite what's in the btree, so we have a special iterator and ++ * operations for the regular btree iter code to use: ++ */ ++ ++static int __journal_key_cmp(enum btree_id l_btree_id, ++ unsigned l_level, ++ struct bpos l_pos, ++ const struct journal_key *r) ++{ ++ return (cmp_int(l_btree_id, r->btree_id) ?: ++ cmp_int(l_level, r->level) ?: ++ bpos_cmp(l_pos, r->k->k.p)); ++} ++ ++static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) ++{ ++ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); ++} ++ ++static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) ++{ ++ size_t gap_size = keys->size - keys->nr; ++ ++ if (idx >= keys->gap) ++ idx += gap_size; ++ return idx; ++} ++ ++static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) ++{ ++ return keys->d + idx_to_pos(keys, idx); ++} ++ ++static size_t __bch2_journal_key_search(struct journal_keys *keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ size_t l = 0, r = keys->nr, m; ++ ++ while (l < r) { ++ m = l + ((r - l) >> 1); ++ if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) ++ l = m + 1; ++ else ++ r = m; ++ } ++ ++ BUG_ON(l < keys->nr && ++ __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); ++ ++ BUG_ON(l && ++ __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); ++ ++ return l; ++} ++ ++static size_t bch2_journal_key_search(struct journal_keys *keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); ++} ++ ++struct bkey_i 
*bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, struct bpos pos, ++ struct bpos end_pos, size_t *idx) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ unsigned iters = 0; ++ struct journal_key *k; ++search: ++ if (!*idx) ++ *idx = __bch2_journal_key_search(keys, btree_id, level, pos); ++ ++ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { ++ if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) ++ return NULL; ++ ++ if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && ++ !k->overwritten) ++ return k->k; ++ ++ (*idx)++; ++ iters++; ++ if (iters == 10) { ++ *idx = 0; ++ goto search; ++ } ++ } ++ ++ return NULL; ++} ++ ++struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, struct bpos pos) ++{ ++ size_t idx = 0; ++ ++ return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); ++} ++ ++static void journal_iters_fix(struct bch_fs *c) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ /* The key we just inserted is immediately before the gap: */ ++ size_t gap_end = keys->gap + (keys->size - keys->nr); ++ struct btree_and_journal_iter *iter; ++ ++ /* ++ * If an iterator points one after the key we just inserted, decrement ++ * the iterator so it points at the key we just inserted - if the ++ * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will ++ * handle that: ++ */ ++ list_for_each_entry(iter, &c->journal_iters, journal.list) ++ if (iter->journal.idx == gap_end) ++ iter->journal.idx = keys->gap - 1; ++} ++ ++static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ struct journal_iter *iter; ++ size_t gap_size = keys->size - keys->nr; ++ ++ list_for_each_entry(iter, &c->journal_iters, list) { ++ if (iter->idx > old_gap) ++ iter->idx -= gap_size; ++ if (iter->idx >= new_gap) ++ iter->idx += gap_size; ++ } ++} ++ ++int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ struct journal_key n = { ++ .btree_id = id, ++ .level = level, ++ .k = k, ++ .allocated = true, ++ /* ++ * Ensure these keys are done last by journal replay, to unblock ++ * journal reclaim: ++ */ ++ .journal_seq = U32_MAX, ++ }; ++ struct journal_keys *keys = &c->journal_keys; ++ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); ++ ++ BUG_ON(test_bit(BCH_FS_RW, &c->flags)); ++ ++ if (idx < keys->size && ++ journal_key_cmp(&n, &keys->d[idx]) == 0) { ++ if (keys->d[idx].allocated) ++ kfree(keys->d[idx].k); ++ keys->d[idx] = n; ++ return 0; ++ } ++ ++ if (idx > keys->gap) ++ idx -= keys->size - keys->nr; ++ ++ if (keys->nr == keys->size) { ++ struct journal_keys new_keys = { ++ .nr = keys->nr, ++ .size = max_t(size_t, keys->size, 8) * 2, ++ }; ++ ++ new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); ++ if (!new_keys.d) { ++ bch_err(c, "%s: error allocating new key array (size %zu)", ++ __func__, new_keys.size); ++ return -BCH_ERR_ENOMEM_journal_key_insert; ++ } ++ ++ /* Since @keys was full, there was no gap: */ ++ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); ++ kvfree(keys->d); ++ *keys = new_keys; ++ ++ /* And now the gap is at the end: */ ++ keys->gap = keys->nr; ++ } ++ ++ journal_iters_move_gap(c, keys->gap, idx); ++ ++ move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); ++ keys->gap = idx; ++ ++ keys->nr++; ++ keys->d[keys->gap++] = n; ++ ++ journal_iters_fix(c); ++ ++ return 0; 
++} ++ ++/* ++ * Can only be used from the recovery thread while we're still RO - can't be ++ * used once we've got RW, as journal_keys is at that point used by multiple ++ * threads: ++ */ ++int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ struct bkey_i *n; ++ int ret; ++ ++ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); ++ if (!n) ++ return -BCH_ERR_ENOMEM_journal_key_insert; ++ ++ bkey_copy(n, k); ++ ret = bch2_journal_key_insert_take(c, id, level, n); ++ if (ret) ++ kfree(n); ++ return ret; ++} ++ ++int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bpos pos) ++{ ++ struct bkey_i whiteout; ++ ++ bkey_init(&whiteout.k); ++ whiteout.k.p = pos; ++ ++ return bch2_journal_key_insert(c, id, level, &whiteout); ++} ++ ++void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, ++ unsigned level, struct bpos pos) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ size_t idx = bch2_journal_key_search(keys, btree, level, pos); ++ ++ if (idx < keys->size && ++ keys->d[idx].btree_id == btree && ++ keys->d[idx].level == level && ++ bpos_eq(keys->d[idx].k->k.p, pos)) ++ keys->d[idx].overwritten = true; ++} ++ ++static void bch2_journal_iter_advance(struct journal_iter *iter) ++{ ++ if (iter->idx < iter->keys->size) { ++ iter->idx++; ++ if (iter->idx == iter->keys->gap) ++ iter->idx += iter->keys->size - iter->keys->nr; ++ } ++} ++ ++static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) ++{ ++ struct journal_key *k = iter->keys->d + iter->idx; ++ ++ while (k < iter->keys->d + iter->keys->size && ++ k->btree_id == iter->btree_id && ++ k->level == iter->level) { ++ if (!k->overwritten) ++ return bkey_i_to_s_c(k->k); ++ ++ bch2_journal_iter_advance(iter); ++ k = iter->keys->d + iter->idx; ++ } ++ ++ return bkey_s_c_null; ++} ++ ++static void bch2_journal_iter_exit(struct journal_iter *iter) ++{ ++ list_del(&iter->list); ++} ++ ++static void bch2_journal_iter_init(struct bch_fs *c, ++ struct journal_iter *iter, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ iter->btree_id = id; ++ iter->level = level; ++ iter->keys = &c->journal_keys; ++ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); ++} ++ ++static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) ++{ ++ return bch2_btree_node_iter_peek_unpack(&iter->node_iter, ++ iter->b, &iter->unpacked); ++} ++ ++static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) ++{ ++ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); ++} ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) ++{ ++ if (bpos_eq(iter->pos, SPOS_MAX)) ++ iter->at_end = true; ++ else ++ iter->pos = bpos_successor(iter->pos); ++} ++ ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) ++{ ++ struct bkey_s_c btree_k, journal_k, ret; ++again: ++ if (iter->at_end) ++ return bkey_s_c_null; ++ ++ while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && ++ bpos_lt(btree_k.k->p, iter->pos)) ++ bch2_journal_iter_advance_btree(iter); ++ ++ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && ++ bpos_lt(journal_k.k->p, iter->pos)) ++ bch2_journal_iter_advance(&iter->journal); ++ ++ ret = journal_k.k && ++ (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) ++ ? 
journal_k ++ : btree_k; ++ ++ if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) ++ ret = bkey_s_c_null; ++ ++ if (ret.k) { ++ iter->pos = ret.k->p; ++ if (bkey_deleted(ret.k)) { ++ bch2_btree_and_journal_iter_advance(iter); ++ goto again; ++ } ++ } else { ++ iter->pos = SPOS_MAX; ++ iter->at_end = true; ++ } ++ ++ return ret; ++} ++ ++void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) ++{ ++ bch2_journal_iter_exit(&iter->journal); ++} ++ ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b, ++ struct btree_node_iter node_iter, ++ struct bpos pos) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->b = b; ++ iter->node_iter = node_iter; ++ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); ++ INIT_LIST_HEAD(&iter->journal.list); ++ iter->pos = b->data->min_key; ++ iter->at_end = false; ++} ++ ++/* ++ * this version is used by btree_gc before filesystem has gone RW and ++ * multithreaded, so uses the journal_iters list: ++ */ ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct bch_fs *c, ++ struct btree *b) ++{ ++ struct btree_node_iter node_iter; ++ ++ bch2_btree_node_iter_init_from_start(&node_iter, b); ++ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); ++ list_add(&iter->journal.list, &c->journal_iters); ++} ++ ++/* sort and dedup all keys in the journal: */ ++ ++void bch2_journal_entries_free(struct bch_fs *c) ++{ ++ struct journal_replay **i; ++ struct genradix_iter iter; ++ ++ genradix_for_each(&c->journal_entries, iter, i) ++ if (*i) ++ kvpfree(*i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&(*i)->j)); ++ genradix_free(&c->journal_entries); ++} ++ ++/* ++ * When keys compare equal, oldest compares first: ++ */ ++static int journal_sort_key_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = _l; ++ const struct journal_key *r = _r; ++ ++ return journal_key_cmp(l, r) ?: ++ cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->journal_offset, r->journal_offset); ++} ++ ++void bch2_journal_keys_free(struct journal_keys *keys) ++{ ++ struct journal_key *i; ++ ++ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); ++ keys->gap = keys->nr; ++ ++ for (i = keys->d; i < keys->d + keys->nr; i++) ++ if (i->allocated) ++ kfree(i->k); ++ ++ kvfree(keys->d); ++ keys->d = NULL; ++ keys->nr = keys->gap = keys->size = 0; ++} ++ ++static void __journal_keys_sort(struct journal_keys *keys) ++{ ++ struct journal_key *src, *dst; ++ ++ sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); ++ ++ src = dst = keys->d; ++ while (src < keys->d + keys->nr) { ++ while (src + 1 < keys->d + keys->nr && ++ src[0].btree_id == src[1].btree_id && ++ src[0].level == src[1].level && ++ bpos_eq(src[0].k->k.p, src[1].k->k.p)) ++ src++; ++ ++ *dst++ = *src++; ++ } ++ ++ keys->nr = dst - keys->d; ++} ++ ++int bch2_journal_keys_sort(struct bch_fs *c) ++{ ++ struct genradix_iter iter; ++ struct journal_replay *i, **_i; ++ struct jset_entry *entry; ++ struct bkey_i *k; ++ struct journal_keys *keys = &c->journal_keys; ++ size_t nr_keys = 0, nr_read = 0; ++ ++ genradix_for_each(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ for_each_jset_key(k, entry, &i->j) ++ nr_keys++; ++ } ++ ++ if (!nr_keys) ++ return 0; ++ ++ keys->size = roundup_pow_of_two(nr_keys); ++ ++ keys->d = kvmalloc_array(keys->size, 
sizeof(keys->d[0]), GFP_KERNEL); ++ if (!keys->d) { ++ bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", ++ nr_keys); ++ ++ do { ++ keys->size >>= 1; ++ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); ++ } while (!keys->d && keys->size > nr_keys / 8); ++ ++ if (!keys->d) { ++ bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", ++ keys->size); ++ return -BCH_ERR_ENOMEM_journal_keys_sort; ++ } ++ } ++ ++ genradix_for_each(&c->journal_entries, iter, _i) { ++ i = *_i; ++ ++ if (!i || i->ignore) ++ continue; ++ ++ cond_resched(); ++ ++ for_each_jset_key(k, entry, &i->j) { ++ if (keys->nr == keys->size) { ++ __journal_keys_sort(keys); ++ ++ if (keys->nr > keys->size * 7 / 8) { ++ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", ++ keys->nr, keys->size, nr_read, nr_keys); ++ return -BCH_ERR_ENOMEM_journal_keys_sort; ++ } ++ } ++ ++ keys->d[keys->nr++] = (struct journal_key) { ++ .btree_id = entry->btree_id, ++ .level = entry->level, ++ .k = k, ++ .journal_seq = le64_to_cpu(i->j.seq), ++ .journal_offset = k->_data - i->j._data, ++ }; ++ ++ nr_read++; ++ } ++ } ++ ++ __journal_keys_sort(keys); ++ keys->gap = keys->nr; ++ ++ bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); ++ return 0; ++} +diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h +new file mode 100644 +index 000000000..5d64e7e22 +--- /dev/null ++++ b/fs/bcachefs/btree_journal_iter.h +@@ -0,0 +1,57 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H ++#define _BCACHEFS_BTREE_JOURNAL_ITER_H ++ ++struct journal_iter { ++ struct list_head list; ++ enum btree_id btree_id; ++ unsigned level; ++ size_t idx; ++ struct journal_keys *keys; ++}; ++ ++/* ++ * Iterate over keys in the btree, with keys from the journal overlaid on top: ++ */ ++ ++struct btree_and_journal_iter { ++ struct btree *b; ++ struct btree_node_iter node_iter; ++ struct bkey unpacked; ++ ++ struct journal_iter journal; ++ struct bpos pos; ++ bool at_end; ++}; ++ ++struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos, struct bpos, size_t *); ++struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); ++ ++int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, ++ unsigned, struct bkey_i *); ++int bch2_journal_key_insert(struct bch_fs *, enum btree_id, ++ unsigned, struct bkey_i *); ++int bch2_journal_key_delete(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); ++void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, ++ unsigned, struct bpos); ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); ++ ++void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); ++void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct bch_fs *, struct btree *, ++ struct btree_node_iter, struct bpos); ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct bch_fs *, ++ struct btree *); ++ ++void bch2_journal_keys_free(struct journal_keys *); ++void bch2_journal_entries_free(struct bch_fs *); ++ ++int bch2_journal_keys_sort(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c 
b/fs/bcachefs/btree_key_cache.c new file mode 100644 index 000000000..f7c001d42 @@ -27881,7 +28223,7 @@ index 000000000..be3acde2c +#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c new file mode 100644 -index 000000000..d7fd87149 +index 000000000..0b0f9d607 --- /dev/null +++ b/fs/bcachefs/btree_locking.c @@ -0,0 +1,797 @@ @@ -28274,7 +28616,7 @@ index 000000000..d7fd87149 + six_lock_readers_add(&b->lock, readers); + + if (ret) -+ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent); ++ mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED); + + return ret; +} @@ -28438,7 +28780,7 @@ index 000000000..d7fd87149 + trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); + return false; +success: -+ mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); ++ mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); + return true; +} + @@ -28553,7 +28895,7 @@ index 000000000..d7fd87149 + } else { + if (btree_node_intent_locked(path, l)) { + six_lock_downgrade(&path->l[l].b->c.lock); -+ mark_btree_node_locked_noreset(path, l, SIX_LOCK_read); ++ mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED); + } + break; + } @@ -28684,10 +29026,10 @@ index 000000000..d7fd87149 +#endif diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h new file mode 100644 -index 000000000..f3e58aa27 +index 000000000..22e2cd391 --- /dev/null +++ b/fs/bcachefs/btree_locking.h -@@ -0,0 +1,424 @@ +@@ -0,0 +1,423 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_LOCKING_H +#define _BCACHEFS_BTREE_LOCKING_H @@ -28700,9 +29042,8 @@ index 000000000..f3e58aa27 + * updating the iterator state + */ + -+#include -+ +#include "btree_iter.h" ++#include "six.h" + +void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); + @@ -28871,7 +29212,7 @@ index 000000000..f3e58aa27 + EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); + EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); + -+ mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); ++ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); + + trans_for_each_path_with_node(trans, b, linked) + linked->l[b->c.level].lock_seq++; @@ -28984,7 +29325,7 @@ index 000000000..f3e58aa27 + * write lock: thus, we need to tell the cycle detector we have a write + * lock _before_ taking the lock: + */ -+ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write); ++ mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED); + + return likely(six_trylock_write(&b->lock)) + ? 
0 @@ -29112,19 +29453,1180 @@ index 000000000..f3e58aa27 +#endif + +#endif /* _BCACHEFS_BTREE_LOCKING_H */ +diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c +new file mode 100644 +index 000000000..eafb0388e +--- /dev/null ++++ b/fs/bcachefs/btree_trans_commit.c +@@ -0,0 +1,1156 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_gc.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_journal_iter.h" ++#include "btree_key_cache.h" ++#include "btree_update_interior.h" ++#include "btree_write_buffer.h" ++#include "buckets.h" ++#include "errcode.h" ++#include "error.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "replicas.h" ++#include "snapshot.h" ++ ++#include ++ ++static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bch_fs *c = trans->c; ++ struct bkey u; ++ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); ++ ++ if (unlikely(trans->journal_replay_not_finished)) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); ++ ++ if (j_k) ++ k = bkey_i_to_s_c(j_k); ++ } ++ ++ u = *k.k; ++ u.needs_whiteout = i->old_k.needs_whiteout; ++ ++ BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); ++ BUG_ON(i->old_v != k.v); ++#endif ++} ++ ++static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) ++{ ++ return i->path->l + i->level; ++} ++ ++static inline bool same_leaf_as_prev(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ return i != trans->updates && ++ insert_l(&i[0])->b == insert_l(&i[-1])->b; ++} ++ ++static inline bool same_leaf_as_next(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ return i + 1 < trans->updates + trans->nr_updates && ++ insert_l(&i[0])->b == insert_l(&i[1])->b; ++} ++ ++inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b) ++{ ++ struct bch_fs *c = trans->c; ++ ++ if (unlikely(btree_node_just_written(b)) && ++ bch2_btree_post_write_cleanup(c, b)) ++ bch2_trans_node_reinit_iter(trans, b); ++ ++ /* ++ * If the last bset has been written, or if it's gotten too big - start ++ * a new bset to insert into: ++ */ ++ if (want_new_bset(c, b)) ++ bch2_btree_init_next(trans, b); ++} ++ ++/* Inserting into a given leaf node (last stage of insert): */ ++ ++/* Handle overwrites and do insert, for non extents: */ ++bool bch2_btree_bset_insert_key(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_i *insert) ++{ ++ struct bkey_packed *k; ++ unsigned clobber_u64s = 0, new_u64s = 0; ++ ++ EBUG_ON(btree_node_just_written(b)); ++ EBUG_ON(bset_written(b, btree_bset_last(b))); ++ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); ++ EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); ++ EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); ++ EBUG_ON(insert->k.u64s > ++ bch_btree_keys_u64s_remaining(trans->c, b)); ++ EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) ++ k = NULL; ++ ++ /* @k is the key being overwritten/deleted, if any: */ ++ EBUG_ON(k && bkey_deleted(k)); ++ ++ /* Deleting, but not found? 
nothing to do: */ ++ if (bkey_deleted(&insert->k) && !k) ++ return false; ++ ++ if (bkey_deleted(&insert->k)) { ++ /* Deleting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ if (k->needs_whiteout) ++ push_whiteout(trans->c, b, insert->k.p); ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ bch2_bset_delete(b, k, clobber_u64s); ++ goto fix_iter; ++ } else { ++ bch2_btree_path_fix_key_modified(trans, b, k); ++ } ++ ++ return true; ++ } ++ ++ if (k) { ++ /* Overwriting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ insert->k.needs_whiteout = k->needs_whiteout; ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ goto overwrite; ++ } else { ++ bch2_btree_path_fix_key_modified(trans, b, k); ++ } ++ } ++ ++ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); ++overwrite: ++ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); ++ new_u64s = k->u64s; ++fix_iter: ++ if (clobber_u64s != new_u64s) ++ bch2_btree_node_iter_fix(trans, path, b, node_iter, k, ++ clobber_u64s, new_u64s); ++ return true; ++} ++ ++static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, ++ unsigned i, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct btree_write *w = container_of(pin, struct btree_write, journal); ++ struct btree *b = container_of(w, struct btree, writes[i]); ++ struct btree_trans trans; ++ unsigned long old, new, v; ++ unsigned idx = w - b->writes; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); ++ v = READ_ONCE(b->flags); ++ ++ do { ++ old = new = v; ++ ++ if (!(old & (1 << BTREE_NODE_dirty)) || ++ !!(old & (1 << BTREE_NODE_write_idx)) != idx || ++ w->journal.seq != seq) ++ break; ++ ++ new &= ~BTREE_WRITE_TYPE_MASK; ++ new |= BTREE_WRITE_journal_reclaim; ++ new |= 1 << BTREE_NODE_need_write; ++ } while ((v = cmpxchg(&b->flags, old, new)) != old); ++ ++ btree_node_write_if_need(c, b, SIX_LOCK_read); ++ six_unlock_read(&b->c.lock); ++ ++ bch2_trans_exit(&trans); ++ return 0; ++} ++ ++int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 0, seq); ++} ++ ++int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 1, seq); ++} ++ ++inline void bch2_btree_add_journal_pin(struct bch_fs *c, ++ struct btree *b, u64 seq) ++{ ++ struct btree_write *w = btree_current_write(b); ++ ++ bch2_journal_pin_add(&c->journal, seq, &w->journal, ++ btree_node_write_idx(b) == 0 ++ ? 
bch2_btree_node_flush0 ++ : bch2_btree_node_flush1); ++} ++ ++/** ++ * btree_insert_key - insert a key one key into a leaf node ++ */ ++inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, ++ struct btree_path *path, ++ struct bkey_i *insert, ++ u64 journal_seq) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = path_l(path)->b; ++ struct bset_tree *t = bset_tree_last(b); ++ struct bset *i = bset(b, t); ++ int old_u64s = bset_u64s(t); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, ++ &path_l(path)->iter, insert))) ++ return; ++ ++ i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, journal_seq); ++ ++ if (unlikely(!btree_node_dirty(b))) { ++ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); ++ set_btree_node_dirty_acct(c, b); ++ } ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) bset_u64s(t) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_trans_node_reinit_iter(trans, b); ++} ++ ++/* Cached btree updates: */ ++ ++/* Normal update interface: */ ++ ++static inline void btree_insert_entry_checks(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); ++ BUG_ON(i->cached != i->path->cached); ++ BUG_ON(i->level != i->path->level); ++ BUG_ON(i->btree_id != i->path->btree_id); ++ EBUG_ON(!i->level && ++ !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && ++ test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && ++ i->k->k.p.snapshot && ++ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); ++} ++ ++static noinline int ++bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, ++ unsigned long trace_ip) ++{ ++ return drop_locks_do(trans, ++ bch2_journal_preres_get(&trans->c->journal, ++ &trans->journal_preres, ++ trans->journal_preres_u64s, ++ (flags & BCH_WATERMARK_MASK))); ++} ++ ++static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, ++ unsigned flags) ++{ ++ return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, ++ trans->journal_u64s, flags); ++} ++ ++#define JSET_ENTRY_LOG_U64s 4 ++ ++static noinline void journal_transaction_name(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ struct jset_entry *entry = ++ bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_log, 0, 0, ++ JSET_ENTRY_LOG_U64s); ++ struct jset_entry_log *l = ++ container_of(entry, struct jset_entry_log, entry); ++ ++ strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); ++} ++ ++static inline int btree_key_can_insert(struct btree_trans *trans, ++ struct btree *b, unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ ++ if (!bch2_btree_node_insert_fits(c, b, u64s)) ++ return -BCH_ERR_btree_insert_btree_node_full; ++ ++ return 0; ++} ++ ++static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, ++ struct btree_path *path, unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck = (void *) path->l[0].b; ++ struct btree_insert_entry *i; ++ unsigned new_u64s; ++ struct bkey_i 
*new_k; ++ ++ EBUG_ON(path->level); ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && ++ bch2_btree_key_cache_must_wait(c) && ++ !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) ++ return -BCH_ERR_btree_insert_need_journal_reclaim; ++ ++ /* ++ * bch2_varint_decode can read past the end of the buffer by at most 7 ++ * bytes (it won't be used): ++ */ ++ u64s += 1; ++ ++ if (u64s <= ck->u64s) ++ return 0; ++ ++ new_u64s = roundup_pow_of_two(u64s); ++ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) { ++ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", ++ bch2_btree_ids[path->btree_id], new_u64s); ++ return -BCH_ERR_ENOMEM_btree_key_cache_insert; ++ } ++ ++ trans_for_each_update(trans, i) ++ if (i->old_v == &ck->k->v) ++ i->old_v = &new_k->v; ++ ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ return 0; ++} ++ ++/* Triggers: */ ++ ++static int run_one_mem_trigger(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ unsigned flags) ++{ ++ struct bkey_s_c old = { &i->old_k, i->old_v }; ++ struct bkey_i *new = i->k; ++ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); ++ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); ++ int ret; ++ ++ verify_update_old_key(trans, i); ++ ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ ++ if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id)) ++ return 0; ++ ++ if (old_ops->atomic_trigger == new_ops->atomic_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ ret = bch2_mark_key(trans, i->btree_id, i->level, ++ old, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); ++ } else { ++ struct bkey _deleted = KEY(0, 0, 0); ++ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; ++ ++ _deleted.p = i->path->pos; ++ ++ ret = bch2_mark_key(trans, i->btree_id, i->level, ++ deleted, bkey_i_to_s_c(new), ++ BTREE_TRIGGER_INSERT|flags) ?: ++ bch2_mark_key(trans, i->btree_id, i->level, ++ old, deleted, ++ BTREE_TRIGGER_OVERWRITE|flags); ++ } ++ ++ return ret; ++} ++ ++static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, ++ bool overwrite) ++{ ++ /* ++ * Transactional triggers create new btree_insert_entries, so we can't ++ * pass them a pointer to a btree_insert_entry, that memory is going to ++ * move: ++ */ ++ struct bkey old_k = i->old_k; ++ struct bkey_s_c old = { &old_k, i->old_v }; ++ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); ++ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); ++ ++ verify_update_old_key(trans, i); ++ ++ if ((i->flags & BTREE_TRIGGER_NORUN) || ++ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) ++ return 0; ++ ++ if (!i->insert_trigger_run && ++ !i->overwrite_trigger_run && ++ old_ops->trans_trigger == new_ops->trans_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { ++ i->overwrite_trigger_run = true; ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, ++ BTREE_TRIGGER_INSERT| ++ BTREE_TRIGGER_OVERWRITE| ++ i->flags) ?: 1; ++ } else if (overwrite && !i->overwrite_trigger_run) { ++ i->overwrite_trigger_run = true; ++ return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; ++ } else if (!overwrite && !i->insert_trigger_run) { ++ i->insert_trigger_run = true; ++ return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; ++ } else { ++ return 0; ++ } ++} ++ ++static int 
run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, ++ struct btree_insert_entry *btree_id_start) ++{ ++ struct btree_insert_entry *i; ++ bool trans_trigger_run; ++ int ret, overwrite; ++ ++ for (overwrite = 1; overwrite >= 0; --overwrite) { ++ ++ /* ++ * Running triggers will append more updates to the list of updates as ++ * we're walking it: ++ */ ++ do { ++ trans_trigger_run = false; ++ ++ for (i = btree_id_start; ++ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; ++ i++) { ++ if (i->btree_id != btree_id) ++ continue; ++ ++ ret = run_one_trans_trigger(trans, i, overwrite); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ trans_trigger_run = true; ++ } ++ } while (trans_trigger_run); ++ } ++ ++ return 0; ++} ++ ++static int bch2_trans_commit_run_triggers(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; ++ unsigned btree_id = 0; ++ int ret = 0; ++ ++ /* ++ * ++ * For a given btree, this algorithm runs insert triggers before ++ * overwrite triggers: this is so that when extents are being moved ++ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before ++ * they are re-added. ++ */ ++ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { ++ if (btree_id == BTREE_ID_alloc) ++ continue; ++ ++ while (btree_id_start < trans->updates + trans->nr_updates && ++ btree_id_start->btree_id < btree_id) ++ btree_id_start++; ++ ++ ret = run_btree_triggers(trans, btree_id, btree_id_start); ++ if (ret) ++ return ret; ++ } ++ ++ trans_for_each_update(trans, i) { ++ if (i->btree_id > BTREE_ID_alloc) ++ break; ++ if (i->btree_id == BTREE_ID_alloc) { ++ ret = run_btree_triggers(trans, BTREE_ID_alloc, i); ++ if (ret) ++ return ret; ++ break; ++ } ++ } ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans_for_each_update(trans, i) ++ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && ++ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && ++ (!i->insert_trigger_run || !i->overwrite_trigger_run)); ++#endif ++ return 0; ++} ++ ++static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ int ret = 0; ++ ++ trans_for_each_update(trans, i) { ++ /* ++ * XXX: synchronization of cached update triggers with gc ++ * XXX: synchronization of interior node updates with gc ++ */ ++ BUG_ON(i->cached || i->level); ++ ++ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { ++ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); ++ if (ret) ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static inline int ++bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, ++ struct btree_insert_entry **stopped_at, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ struct btree_write_buffered_key *wb; ++ struct btree_trans_commit_hook *h; ++ unsigned u64s = 0; ++ bool marking = false; ++ int ret; ++ ++ if (race_fault()) { ++ trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); ++ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); ++ } ++ ++ /* ++ * Check if the insert will fit in the leaf node with the write lock ++ * held, otherwise another thread could write the node changing the ++ * amount of space available: ++ */ ++ ++ prefetch(&trans->c->journal.flags); ++ ++ trans_for_each_update(trans, i) { ++ /* Multiple inserts might go to same leaf: */ ++ if (!same_leaf_as_prev(trans, i)) ++ u64s = 0; ++ ++ u64s += i->k->k.u64s; 
++ ret = !i->cached ++ ? btree_key_can_insert(trans, insert_l(i)->b, u64s) ++ : btree_key_can_insert_cached(trans, flags, i->path, u64s); ++ if (ret) { ++ *stopped_at = i; ++ return ret; ++ } ++ ++ if (btree_node_type_needs_gc(i->bkey_type)) ++ marking = true; ++ } ++ ++ if (trans->nr_wb_updates && ++ trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) ++ return -BCH_ERR_btree_insert_need_flush_buffer; ++ ++ /* ++ * Don't get journal reservation until after we know insert will ++ * succeed: ++ */ ++ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ ret = bch2_trans_journal_res_get(trans, ++ (flags & BCH_WATERMARK_MASK)| ++ JOURNAL_RES_GET_NONBLOCK); ++ if (ret) ++ return ret; ++ ++ if (unlikely(trans->journal_transaction_names)) ++ journal_transaction_name(trans); ++ } else { ++ trans->journal_res.seq = c->journal.replay_journal_seq; ++ } ++ ++ /* ++ * Not allowed to fail after we've gotten our journal reservation - we ++ * have to use it: ++ */ ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && ++ !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { ++ if (bch2_journal_seq_verify) ++ trans_for_each_update(trans, i) ++ i->k->k.version.lo = trans->journal_res.seq; ++ else if (bch2_inject_invalid_keys) ++ trans_for_each_update(trans, i) ++ i->k->k.version = MAX_VERSION; ++ } ++ ++ if (trans->fs_usage_deltas && ++ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) ++ return -BCH_ERR_btree_insert_need_mark_replicas; ++ ++ if (trans->nr_wb_updates) { ++ EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); ++ ++ ret = bch2_btree_insert_keys_write_buffer(trans); ++ if (ret) ++ goto revert_fs_usage; ++ } ++ ++ h = trans->hooks; ++ while (h) { ++ ret = h->fn(trans, h); ++ if (ret) ++ goto revert_fs_usage; ++ h = h->next; ++ } ++ ++ trans_for_each_update(trans, i) ++ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { ++ ret = run_one_mem_trigger(trans, i, i->flags); ++ if (ret) ++ goto fatal_err; ++ } ++ ++ if (unlikely(c->gc_pos.phase)) { ++ ret = bch2_trans_commit_run_gc_triggers(trans); ++ if (ret) ++ goto fatal_err; ++ } ++ ++ if (unlikely(trans->extra_journal_entries.nr)) { ++ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), ++ trans->extra_journal_entries.data, ++ trans->extra_journal_entries.nr); ++ ++ trans->journal_res.offset += trans->extra_journal_entries.nr; ++ trans->journal_res.u64s -= trans->extra_journal_entries.nr; ++ } ++ ++ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ struct journal *j = &c->journal; ++ struct jset_entry *entry; ++ ++ trans_for_each_update(trans, i) { ++ if (i->key_cache_already_flushed) ++ continue; ++ ++ if (i->flags & BTREE_UPDATE_NOJOURNAL) ++ continue; ++ ++ verify_update_old_key(trans, i); ++ ++ if (trans->journal_transaction_names) { ++ entry = bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_overwrite, ++ i->btree_id, i->level, ++ i->old_k.u64s); ++ bkey_reassemble(&entry->start[0], ++ (struct bkey_s_c) { &i->old_k, i->old_v }); ++ } ++ ++ entry = bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_btree_keys, ++ i->btree_id, i->level, ++ i->k->k.u64s); ++ bkey_copy(&entry->start[0], i->k); ++ } ++ ++ trans_for_each_wb_update(trans, wb) { ++ entry = bch2_journal_add_entry(j, &trans->journal_res, ++ BCH_JSET_ENTRY_btree_keys, ++ wb->btree, 0, ++ wb->k.k.u64s); ++ bkey_copy(&entry->start[0], &wb->k); ++ } ++ ++ if (trans->journal_seq) ++ *trans->journal_seq = trans->journal_res.seq; ++ } ++ ++ trans_for_each_update(trans, i) { ++ i->k->k.needs_whiteout = false; 
++ ++ if (!i->cached) { ++ u64 seq = trans->journal_res.seq; ++ ++ if (i->flags & BTREE_UPDATE_PREJOURNAL) ++ seq = i->seq; ++ ++ bch2_btree_insert_key_leaf(trans, i->path, i->k, seq); ++ } else if (!i->key_cache_already_flushed) ++ bch2_btree_insert_key_cached(trans, flags, i); ++ else { ++ bch2_btree_key_cache_drop(trans, i->path); ++ btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); ++ } ++ } ++ ++ return 0; ++fatal_err: ++ bch2_fatal_error(c); ++revert_fs_usage: ++ if (trans->fs_usage_deltas) ++ bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); ++ return ret; ++} ++ ++static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) ++{ ++ while (--i >= trans->updates) { ++ if (same_leaf_as_prev(trans, i)) ++ continue; ++ ++ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); ++ } ++ ++ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); ++} ++ ++static inline int trans_lock_write(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) { ++ if (same_leaf_as_prev(trans, i)) ++ continue; ++ ++ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) ++ return trans_lock_write_fail(trans, i); ++ ++ if (!i->cached) ++ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); ++ } ++ ++ return 0; ++} ++ ++static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i; ++ struct btree_write_buffered_key *wb; ++ ++ trans_for_each_update(trans, i) ++ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); ++ ++ trans_for_each_wb_update(trans, wb) ++ bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, ++ struct btree_insert_entry *i, ++ struct printbuf *err) ++{ ++ struct bch_fs *c = trans->c; ++ int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; ++ ++ printbuf_reset(err); ++ prt_printf(err, "invalid bkey on insert from %s -> %ps", ++ trans->fn, (void *) i->ip_allocated); ++ prt_newline(err); ++ printbuf_indent_add(err, 2); ++ ++ bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); ++ prt_newline(err); ++ ++ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, rw, err); ++ bch2_print_string_as_lines(KERN_ERR, err->buf); ++ ++ bch2_inconsistent_error(c); ++ bch2_dump_trans_updates(trans); ++ ++ return -EINVAL; ++} ++#endif ++ ++/* ++ * Get journal reservation, take write locks, and attempt to do btree update(s): ++ */ ++static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, ++ struct btree_insert_entry **stopped_at, ++ unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ int ret = 0, u64s_delta = 0; ++ ++ trans_for_each_update(trans, i) { ++ if (i->cached) ++ continue; ++ ++ u64s_delta += !bkey_deleted(&i->k->k) ? 
i->k->k.u64s : 0; ++ u64s_delta -= i->old_btree_u64s; ++ ++ if (!same_leaf_as_next(trans, i)) { ++ if (u64s_delta <= 0) { ++ ret = bch2_foreground_maybe_merge(trans, i->path, ++ i->level, flags); ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ u64s_delta = 0; ++ } ++ } ++ ++ ret = bch2_journal_preres_get(&c->journal, ++ &trans->journal_preres, trans->journal_preres_u64s, ++ (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); ++ if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) ++ ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); ++ if (unlikely(ret)) ++ return ret; ++ ++ ret = trans_lock_write(trans); ++ if (unlikely(ret)) ++ return ret; ++ ++ ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); ++ ++ if (!ret && unlikely(trans->journal_replay_not_finished)) ++ bch2_drop_overwrites_from_journal(trans); ++ ++ trans_for_each_update(trans, i) ++ if (!same_leaf_as_prev(trans, i)) ++ bch2_btree_node_unlock_write_inlined(trans, i->path, ++ insert_l(i)->b); ++ ++ if (!ret && trans->journal_pin) ++ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, ++ trans->journal_pin, NULL); ++ ++ /* ++ * Drop journal reservation after dropping write locks, since dropping ++ * the journal reservation may kick off a journal write: ++ */ ++ bch2_journal_res_put(&c->journal, &trans->journal_res); ++ ++ if (unlikely(ret)) ++ return ret; ++ ++ bch2_trans_downgrade(trans); ++ ++ return 0; ++} ++ ++static int journal_reclaim_wait_done(struct bch_fs *c) ++{ ++ int ret = bch2_journal_error(&c->journal) ?: ++ !bch2_btree_key_cache_must_wait(c); ++ ++ if (!ret) ++ journal_reclaim_kick(&c->journal); ++ return ret; ++} ++ ++static noinline ++int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, ++ struct btree_insert_entry *i, ++ int ret, unsigned long trace_ip) ++{ ++ struct bch_fs *c = trans->c; ++ ++ switch (ret) { ++ case -BCH_ERR_btree_insert_btree_node_full: ++ ret = bch2_btree_split_leaf(trans, i->path, flags); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); ++ break; ++ case -BCH_ERR_btree_insert_need_mark_replicas: ++ ret = drop_locks_do(trans, ++ bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); ++ break; ++ case -BCH_ERR_journal_res_get_blocked: ++ /* ++ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK ++ * flag ++ */ ++ if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && ++ (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { ++ ret = -BCH_ERR_journal_reclaim_would_deadlock; ++ break; ++ } ++ ++ ret = drop_locks_do(trans, ++ bch2_trans_journal_res_get(trans, ++ (flags & BCH_WATERMARK_MASK)| ++ JOURNAL_RES_GET_CHECK)); ++ break; ++ case -BCH_ERR_btree_insert_need_journal_reclaim: ++ bch2_trans_unlock(trans); ++ ++ trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); ++ ++ wait_event_freezable(c->journal.reclaim_wait, ++ (ret = journal_reclaim_wait_done(c))); ++ if (ret < 0) ++ break; ++ ++ ret = bch2_trans_relock(trans); ++ break; ++ case -BCH_ERR_btree_insert_need_flush_buffer: { ++ struct btree_write_buffer *wb = &c->btree_write_buffer; ++ ++ ret = 0; ++ ++ if (wb->state.nr > wb->size * 3 / 4) { ++ bch2_trans_unlock(trans); ++ mutex_lock(&wb->flush_lock); ++ ++ if (wb->state.nr > wb->size * 3 / 4) { ++ bch2_trans_begin(trans); ++ ret = __bch2_btree_write_buffer_flush(trans, ++ flags|BTREE_INSERT_NOCHECK_RW, true); ++ if (!ret) { ++ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); ++ ret = 
btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); ++ } ++ } else { ++ mutex_unlock(&wb->flush_lock); ++ ret = bch2_trans_relock(trans); ++ } ++ } ++ break; ++ } ++ default: ++ BUG_ON(ret >= 0); ++ break; ++ } ++ ++ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); ++ ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && ++ !(flags & BTREE_INSERT_NOWAIT) && ++ (flags & BTREE_INSERT_NOFAIL), c, ++ "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); ++ ++ return ret; ++} ++ ++static noinline int ++bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || ++ test_bit(BCH_FS_STARTED, &c->flags)) ++ return -BCH_ERR_erofs_trans_commit; ++ ++ ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); ++ if (ret) ++ return ret; ++ ++ bch2_write_ref_get(c, BCH_WRITE_REF_trans); ++ return 0; ++} ++ ++/* ++ * This is for updates done in the early part of fsck - btree_gc - before we've ++ * gone RW. we only add the new key to the list of keys for journal replay to ++ * do. ++ */ ++static noinline int ++do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ int ret = 0; ++ ++ trans_for_each_update(trans, i) { ++ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i = NULL; ++ struct btree_write_buffered_key *wb; ++ unsigned u64s; ++ int ret = 0; ++ ++ if (!trans->nr_updates && ++ !trans->nr_wb_updates && ++ !trans->extra_journal_entries.nr) ++ goto out_reset; ++ ++ if (flags & BTREE_INSERT_GC_LOCK_HELD) ++ lockdep_assert_held(&c->gc_lock); ++ ++ ret = bch2_trans_commit_run_triggers(trans); ++ if (ret) ++ goto out_reset; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans_for_each_update(trans, i) { ++ struct printbuf buf = PRINTBUF; ++ enum bkey_invalid_flags invalid_flags = 0; ++ ++ if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) ++ invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; ++ ++ if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), ++ i->bkey_type, invalid_flags, &buf))) ++ ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); ++ btree_insert_entry_checks(trans, i); ++ printbuf_exit(&buf); ++ ++ if (ret) ++ return ret; ++ } ++#endif ++ ++ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { ++ ret = do_bch2_trans_commit_to_journal_replay(trans); ++ goto out_reset; ++ } ++ ++ if (!(flags & BTREE_INSERT_NOCHECK_RW) && ++ unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { ++ ret = bch2_trans_commit_get_rw_cold(trans, flags); ++ if (ret) ++ goto out_reset; ++ } ++ ++ if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && ++ mutex_trylock(&c->btree_write_buffer.flush_lock)) { ++ bch2_trans_begin(trans); ++ bch2_trans_unlock(trans); ++ ++ ret = __bch2_btree_write_buffer_flush(trans, ++ flags|BTREE_INSERT_NOCHECK_RW, true); ++ if (!ret) { ++ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); ++ } ++ goto out; ++ } ++ ++ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); ++ ++ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); ++ ++ trans->journal_u64s = trans->extra_journal_entries.nr; ++ 
trans->journal_preres_u64s = 0; ++ ++ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); ++ ++ if (trans->journal_transaction_names) ++ trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); ++ ++ trans_for_each_update(trans, i) { ++ EBUG_ON(!i->path->should_be_locked); ++ ++ ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); ++ if (unlikely(ret)) ++ goto out; ++ ++ EBUG_ON(!btree_node_intent_locked(i->path, i->level)); ++ ++ if (i->key_cache_already_flushed) ++ continue; ++ ++ /* we're going to journal the key being updated: */ ++ u64s = jset_u64s(i->k->k.u64s); ++ if (i->cached && ++ likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) ++ trans->journal_preres_u64s += u64s; ++ ++ if (i->flags & BTREE_UPDATE_NOJOURNAL) ++ continue; ++ ++ trans->journal_u64s += u64s; ++ ++ /* and we're also going to log the overwrite: */ ++ if (trans->journal_transaction_names) ++ trans->journal_u64s += jset_u64s(i->old_k.u64s); ++ } ++ ++ trans_for_each_wb_update(trans, wb) ++ trans->journal_u64s += jset_u64s(wb->k.k.u64s); ++ ++ if (trans->extra_journal_res) { ++ ret = bch2_disk_reservation_add(c, trans->disk_res, ++ trans->extra_journal_res, ++ (flags & BTREE_INSERT_NOFAIL) ++ ? BCH_DISK_RESERVATION_NOFAIL : 0); ++ if (ret) ++ goto err; ++ } ++retry: ++ bch2_trans_verify_not_in_restart(trans); ++ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); ++ ++ ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); ++ ++ /* make sure we didn't drop or screw up locks: */ ++ bch2_trans_verify_locks(trans); ++ ++ if (ret) ++ goto err; ++ ++ trace_and_count(c, transaction_commit, trans, _RET_IP_); ++out: ++ bch2_journal_preres_put(&c->journal, &trans->journal_preres); ++ ++ if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) ++ bch2_write_ref_put(c, BCH_WRITE_REF_trans); ++out_reset: ++ bch2_trans_reset_updates(trans); ++ ++ return ret; ++err: ++ ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); ++ if (ret) ++ goto out; ++ ++ goto retry; ++} diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h new file mode 100644 -index 000000000..d95360160 +index 000000000..71ad3893e --- /dev/null +++ b/fs/bcachefs/btree_types.h -@@ -0,0 +1,743 @@ +@@ -0,0 +1,746 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_TYPES_H +#define _BCACHEFS_BTREE_TYPES_H + +#include +#include -+#include + +//#include "bkey_methods.h" +#include "buckets_types.h" @@ -29132,6 +30634,7 @@ index 000000000..d95360160 +#include "errcode.h" +#include "journal_types.h" +#include "replicas_types.h" ++#include "six.h" + +struct open_bucket; +struct btree_update; @@ -29763,7 +31266,7 @@ index 000000000..d95360160 +} + +enum btree_node_type { -+#define x(kwd, val) BKEY_TYPE_##kwd = val, ++#define x(kwd, val, ...) 
BKEY_TYPE_##kwd = val, + BCH_BTREE_IDS() +#undef x + BKEY_TYPE_btree, @@ -29782,31 +31285,37 @@ index 000000000..d95360160 +} + +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ -+ ((1U << BKEY_TYPE_extents)| \ -+ (1U << BKEY_TYPE_alloc)| \ -+ (1U << BKEY_TYPE_inodes)| \ -+ (1U << BKEY_TYPE_stripes)| \ -+ (1U << BKEY_TYPE_reflink)| \ -+ (1U << BKEY_TYPE_btree)) ++ (BIT(BKEY_TYPE_extents)| \ ++ BIT(BKEY_TYPE_alloc)| \ ++ BIT(BKEY_TYPE_inodes)| \ ++ BIT(BKEY_TYPE_stripes)| \ ++ BIT(BKEY_TYPE_reflink)| \ ++ BIT(BKEY_TYPE_btree)) + +#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ -+ ((1U << BKEY_TYPE_alloc)| \ -+ (1U << BKEY_TYPE_inodes)| \ -+ (1U << BKEY_TYPE_stripes)| \ -+ (1U << BKEY_TYPE_snapshots)) ++ (BIT(BKEY_TYPE_alloc)| \ ++ BIT(BKEY_TYPE_inodes)| \ ++ BIT(BKEY_TYPE_stripes)| \ ++ BIT(BKEY_TYPE_snapshots)) + +#define BTREE_NODE_TYPE_HAS_TRIGGERS \ + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ + BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + -+#define BTREE_ID_IS_EXTENTS \ -+ ((1U << BTREE_ID_extents)| \ -+ (1U << BTREE_ID_reflink)| \ -+ (1U << BTREE_ID_freespace)) ++static inline bool btree_node_type_needs_gc(enum btree_node_type type) ++{ ++ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); ++} + +static inline bool btree_node_type_is_extents(enum btree_node_type type) +{ -+ return (1U << type) & BTREE_ID_IS_EXTENTS; ++ const unsigned mask = 0 ++#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << nr) ++ BCH_BTREE_IDS() ++#undef x ++ ; ++ ++ return (1U << type) & mask; +} + +static inline bool btree_id_is_extents(enum btree_id btree) @@ -29814,29 +31323,26 @@ index 000000000..d95360160 + return btree_node_type_is_extents((enum btree_node_type) btree); +} + -+#define BTREE_ID_HAS_SNAPSHOTS \ -+ ((1U << BTREE_ID_extents)| \ -+ (1U << BTREE_ID_inodes)| \ -+ (1U << BTREE_ID_dirents)| \ -+ (1U << BTREE_ID_xattrs)) -+ -+#define BTREE_ID_HAS_PTRS \ -+ ((1U << BTREE_ID_extents)| \ -+ (1U << BTREE_ID_reflink)) -+ +static inline bool btree_type_has_snapshots(enum btree_id id) +{ -+ return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; ++ const unsigned mask = 0 ++#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) ++ BCH_BTREE_IDS() ++#undef x ++ ; ++ ++ return (1U << id) & mask; +} + +static inline bool btree_type_has_ptrs(enum btree_id id) +{ -+ return (1 << id) & BTREE_ID_HAS_PTRS; -+} ++ const unsigned mask = 0 ++#define x(name, nr, flags, ...) 
|((!!((flags) & BTREE_ID_DATA)) << nr) ++ BCH_BTREE_IDS() ++#undef x ++ ; + -+static inline bool btree_node_type_needs_gc(enum btree_node_type type) -+{ -+ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); ++ return (1U << id) & mask; +} + +struct btree_root { @@ -29861,12 +31367,916 @@ index 000000000..d95360160 +}; + +#endif /* _BCACHEFS_BTREE_TYPES_H */ +diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c +new file mode 100644 +index 000000000..880ce7431 +--- /dev/null ++++ b/fs/bcachefs/btree_update.c +@@ -0,0 +1,898 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "btree_iter.h" ++#include "btree_journal_iter.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "debug.h" ++#include "errcode.h" ++#include "error.h" ++#include "extents.h" ++#include "keylist.h" ++#include "snapshot.h" ++#include "trace.h" ++ ++static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, ++ const struct btree_insert_entry *r) ++{ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ cmp_int(l->cached, r->cached) ?: ++ -cmp_int(l->level, r->level) ?: ++ bpos_cmp(l->k->k.p, r->k->k.p); ++} ++ ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, ++ struct bkey_i *, enum btree_update_flags, ++ unsigned long ip); ++ ++static noinline int extent_front_merge(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct bkey_i **insert, ++ enum btree_update_flags flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_i *update; ++ int ret; ++ ++ update = bch2_bkey_make_mut_noupdate(trans, k); ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ return ret; ++ ++ if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) ++ return 0; ++ ++ ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?: ++ bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ return 0; ++ ++ ret = bch2_btree_delete_at(trans, iter, flags); ++ if (ret) ++ return ret; ++ ++ *insert = update; ++ return 0; ++} ++ ++static noinline int extent_back_merge(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: ++ bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ return 0; ++ ++ bch2_bkey_merge(c, bkey_i_to_s(insert), k); ++ return 0; ++} ++ ++/* ++ * When deleting, check if we need to emit a whiteout (because we're overwriting ++ * something in an ancestor snapshot) ++ */ ++static int need_whiteout_for_snapshot(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u32 snapshot = pos.snapshot; ++ int ret; ++ ++ if (!bch2_snapshot_parent(trans->c, pos.snapshot)) ++ return 0; ++ ++ pos.snapshot++; ++ ++ for_each_btree_key_norestart(trans, iter, btree_id, pos, ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_NOPRESERVE, k, ret) { ++ if (!bkey_eq(k.k->p, pos)) ++ break; ++ ++ if (bch2_snapshot_is_ancestor(trans->c, snapshot, ++ k.k->p.snapshot)) { ++ ret = !bkey_whiteout(k.k); ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, ++ enum btree_id id, ++ struct bpos old_pos, ++ struct bpos new_pos) 
++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter old_iter, new_iter = { NULL }; ++ struct bkey_s_c old_k, new_k; ++ snapshot_id_list s; ++ struct bkey_i *update; ++ int ret; ++ ++ if (!bch2_snapshot_has_children(c, old_pos.snapshot)) ++ return 0; ++ ++ darray_init(&s); ++ ++ bch2_trans_iter_init(trans, &old_iter, id, old_pos, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ while ((old_k = bch2_btree_iter_prev(&old_iter)).k && ++ !(ret = bkey_err(old_k)) && ++ bkey_eq(old_pos, old_k.k->p)) { ++ struct bpos whiteout_pos = ++ SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; ++ ++ if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || ++ snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) ++ continue; ++ ++ new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ ret = bkey_err(new_k); ++ if (ret) ++ break; ++ ++ if (new_k.k->type == KEY_TYPE_deleted) { ++ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ++ ret = PTR_ERR_OR_ZERO(update); ++ if (ret) ++ break; ++ ++ bkey_init(&update->k); ++ update->k.p = whiteout_pos; ++ update->k.type = KEY_TYPE_whiteout; ++ ++ ret = bch2_trans_update(trans, &new_iter, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ } ++ bch2_trans_iter_exit(trans, &new_iter); ++ ++ ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &new_iter); ++ bch2_trans_iter_exit(trans, &old_iter); ++ darray_exit(&s); ++ ++ return ret; ++} ++ ++int bch2_trans_update_extent_overwrite(struct btree_trans *trans, ++ struct btree_iter *iter, ++ enum btree_update_flags flags, ++ struct bkey_s_c old, ++ struct bkey_s_c new) ++{ ++ enum btree_id btree_id = iter->btree_id; ++ struct bkey_i *update; ++ struct bpos new_start = bkey_start_pos(new.k); ++ bool front_split = bkey_lt(bkey_start_pos(old.k), new_start); ++ bool back_split = bkey_gt(old.k->p, new.k->p); ++ int ret = 0, compressed_sectors; ++ ++ /* ++ * If we're going to be splitting a compressed extent, note it ++ * so that __bch2_trans_commit() can increase our disk ++ * reservation: ++ */ ++ if (((front_split && back_split) || ++ ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) && ++ (compressed_sectors = bch2_bkey_sectors_compressed(old))) ++ trans->extra_journal_res += compressed_sectors; ++ ++ if (front_split) { ++ update = bch2_bkey_make_mut_noupdate(trans, old); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ return ret; ++ ++ bch2_cut_back(new_start, update); ++ ++ ret = bch2_insert_snapshot_whiteouts(trans, btree_id, ++ old.k->p, update->k.p) ?: ++ bch2_btree_insert_nonextent(trans, btree_id, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); ++ if (ret) ++ return ret; ++ } ++ ++ /* If we're overwriting in a different snapshot - middle split: */ ++ if (old.k->p.snapshot != new.k->p.snapshot && ++ (front_split || back_split)) { ++ update = bch2_bkey_make_mut_noupdate(trans, old); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ return ret; ++ ++ bch2_cut_front(new_start, update); ++ bch2_cut_back(new.k->p, update); ++ ++ ret = bch2_insert_snapshot_whiteouts(trans, btree_id, ++ old.k->p, update->k.p) ?: ++ bch2_btree_insert_nonextent(trans, btree_id, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); ++ if (ret) ++ return ret; ++ } ++ ++ if (bkey_le(old.k->p, new.k->p)) { ++ update = bch2_trans_kmalloc(trans, sizeof(*update)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ return ret; ++ ++ bkey_init(&update->k); ++ update->k.p = 
old.k->p; ++ update->k.p.snapshot = new.k->p.snapshot; ++ ++ if (new.k->p.snapshot != old.k->p.snapshot) { ++ update->k.type = KEY_TYPE_whiteout; ++ } else if (btree_type_has_snapshots(btree_id)) { ++ ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ update->k.type = KEY_TYPE_whiteout; ++ } ++ ++ ret = bch2_btree_insert_nonextent(trans, btree_id, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); ++ if (ret) ++ return ret; ++ } ++ ++ if (back_split) { ++ update = bch2_bkey_make_mut_noupdate(trans, old); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ return ret; ++ ++ bch2_cut_front(new.k->p, update); ++ ++ ret = bch2_trans_update_by_path(trans, iter->path, update, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| ++ flags, _RET_IP_); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int bch2_trans_update_extent(struct btree_trans *trans, ++ struct btree_iter *orig_iter, ++ struct bkey_i *insert, ++ enum btree_update_flags flags) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ enum btree_id btree_id = orig_iter->btree_id; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), ++ BTREE_ITER_INTENT| ++ BTREE_ITER_WITH_UPDATES| ++ BTREE_ITER_NOT_EXTENTS); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); ++ if ((ret = bkey_err(k))) ++ goto err; ++ if (!k.k) ++ goto out; ++ ++ if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { ++ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { ++ ret = extent_front_merge(trans, &iter, k, &insert, flags); ++ if (ret) ++ goto err; ++ } ++ ++ goto next; ++ } ++ ++ while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { ++ bool done = bkey_lt(insert->k.p, k.k->p); ++ ++ ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); ++ if (ret) ++ goto err; ++ ++ if (done) ++ goto out; ++next: ++ bch2_btree_iter_advance(&iter); ++ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); ++ if ((ret = bkey_err(k))) ++ goto err; ++ if (!k.k) ++ goto out; ++ } ++ ++ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { ++ ret = extent_back_merge(trans, &iter, insert, k); ++ if (ret) ++ goto err; ++ } ++out: ++ if (!bkey_deleted(&insert->k)) ++ ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++static noinline int flush_new_cached_update(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree_insert_entry *i, ++ enum btree_update_flags flags, ++ unsigned long ip) ++{ ++ struct btree_path *btree_path; ++ struct bkey k; ++ int ret; ++ ++ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, ++ BTREE_ITER_INTENT, _THIS_IP_); ++ ret = bch2_btree_path_traverse(trans, btree_path, 0); ++ if (ret) ++ goto out; ++ ++ /* ++ * The old key in the insert entry might actually refer to an existing ++ * key in the btree that has been deleted from cache and not yet ++ * flushed. Check for this and skip the flush so we don't run triggers ++ * against a stale key. 
++ */ ++ bch2_btree_path_peek_slot_exact(btree_path, &k); ++ if (!bkey_deleted(&k)) ++ goto out; ++ ++ i->key_cache_already_flushed = true; ++ i->flags |= BTREE_TRIGGER_NORUN; ++ ++ btree_path_set_should_be_locked(btree_path); ++ ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); ++out: ++ bch2_path_put(trans, btree_path, true); ++ return ret; ++} ++ ++static int __must_check ++bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, ++ struct bkey_i *k, enum btree_update_flags flags, ++ unsigned long ip) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i, n; ++ u64 seq = 0; ++ int cmp; ++ ++ EBUG_ON(!path->should_be_locked); ++ EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); ++ EBUG_ON(!bpos_eq(k->k.p, path->pos)); ++ ++ /* ++ * The transaction journal res hasn't been allocated at this point. ++ * That occurs at commit time. Reuse the seq field to pass in the seq ++ * of a prejournaled key. ++ */ ++ if (flags & BTREE_UPDATE_PREJOURNAL) ++ seq = trans->journal_res.seq; ++ ++ n = (struct btree_insert_entry) { ++ .flags = flags, ++ .bkey_type = __btree_node_type(path->level, path->btree_id), ++ .btree_id = path->btree_id, ++ .level = path->level, ++ .cached = path->cached, ++ .path = path, ++ .k = k, ++ .seq = seq, ++ .ip_allocated = ip, ++ }; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans_for_each_update(trans, i) ++ BUG_ON(i != trans->updates && ++ btree_insert_entry_cmp(i - 1, i) >= 0); ++#endif ++ ++ /* ++ * Pending updates are kept sorted: first, find position of new update, ++ * then delete/trim any updates the new update overwrites: ++ */ ++ trans_for_each_update(trans, i) { ++ cmp = btree_insert_entry_cmp(&n, i); ++ if (cmp <= 0) ++ break; ++ } ++ ++ if (!cmp && i < trans->updates + trans->nr_updates) { ++ EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); ++ ++ bch2_path_put(trans, i->path, true); ++ i->flags = n.flags; ++ i->cached = n.cached; ++ i->k = n.k; ++ i->path = n.path; ++ i->seq = n.seq; ++ i->ip_allocated = n.ip_allocated; ++ } else { ++ array_insert_item(trans->updates, trans->nr_updates, ++ i - trans->updates, n); ++ ++ i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; ++ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; ++ ++ if (unlikely(trans->journal_replay_not_finished)) { ++ struct bkey_i *j_k = ++ bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); ++ ++ if (j_k) { ++ i->old_k = j_k->k; ++ i->old_v = &j_k->v; ++ } ++ } ++ } ++ ++ __btree_path_get(i->path, true); ++ ++ /* ++ * If a key is present in the key cache, it must also exist in the ++ * btree - this is necessary for cache coherency. 
When iterating over ++ * a btree that's cached in the key cache, the btree iter code checks ++ * the key cache - but the key has to exist in the btree for that to ++ * work: ++ */ ++ if (path->cached && bkey_deleted(&i->old_k)) ++ return flush_new_cached_update(trans, path, i, flags, ip); ++ ++ return 0; ++} ++ ++int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *k, enum btree_update_flags flags) ++{ ++ struct btree_path *path = iter->update_path ?: iter->path; ++ struct bkey_cached *ck; ++ int ret; ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return bch2_trans_update_extent(trans, iter, k, flags); ++ ++ if (bkey_deleted(&k->k) && ++ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && ++ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { ++ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); ++ if (unlikely(ret < 0)) ++ return ret; ++ ++ if (ret) ++ k->k.type = KEY_TYPE_whiteout; ++ } ++ ++ /* ++ * Ensure that updates to cached btrees go to the key cache: ++ */ ++ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && ++ !path->cached && ++ !path->level && ++ btree_id_cached(trans->c, path->btree_id)) { ++ if (!iter->key_cache_path || ++ !iter->key_cache_path->should_be_locked || ++ !bpos_eq(iter->key_cache_path->pos, k->k.p)) { ++ if (!iter->key_cache_path) ++ iter->key_cache_path = ++ bch2_path_get(trans, path->btree_id, path->pos, 1, 0, ++ BTREE_ITER_INTENT| ++ BTREE_ITER_CACHED, _THIS_IP_); ++ ++ iter->key_cache_path = ++ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, ++ iter->flags & BTREE_ITER_INTENT, ++ _THIS_IP_); ++ ++ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, ++ BTREE_ITER_CACHED); ++ if (unlikely(ret)) ++ return ret; ++ ++ ck = (void *) iter->key_cache_path->l[0].b; ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); ++ } ++ ++ btree_path_set_should_be_locked(iter->key_cache_path); ++ } ++ ++ path = iter->key_cache_path; ++ } ++ ++ return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); ++} ++ ++/* ++ * Add a transaction update for a key that has already been journaled. 
++ */ ++int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, ++ struct btree_iter *iter, struct bkey_i *k, ++ enum btree_update_flags flags) ++{ ++ trans->journal_res.seq = seq; ++ return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| ++ BTREE_UPDATE_PREJOURNAL); ++} ++ ++int __must_check bch2_trans_update_buffered(struct btree_trans *trans, ++ enum btree_id btree, ++ struct bkey_i *k) ++{ ++ struct btree_write_buffered_key *i; ++ int ret; ++ ++ EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); ++ EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); ++ ++ trans_for_each_wb_update(trans, i) { ++ if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { ++ bkey_copy(&i->k, k); ++ return 0; ++ } ++ } ++ ++ if (!trans->wb_updates || ++ trans->nr_wb_updates == trans->wb_updates_size) { ++ struct btree_write_buffered_key *u; ++ ++ if (trans->nr_wb_updates == trans->wb_updates_size) { ++ struct btree_transaction_stats *s = btree_trans_stats(trans); ++ ++ BUG_ON(trans->wb_updates_size > U8_MAX / 2); ++ trans->wb_updates_size = max(1, trans->wb_updates_size * 2); ++ if (s) ++ s->wb_updates_size = trans->wb_updates_size; ++ } ++ ++ u = bch2_trans_kmalloc_nomemzero(trans, ++ trans->wb_updates_size * ++ sizeof(struct btree_write_buffered_key)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; ++ ++ if (trans->nr_wb_updates) ++ memcpy(u, trans->wb_updates, trans->nr_wb_updates * ++ sizeof(struct btree_write_buffered_key)); ++ trans->wb_updates = u; ++ } ++ ++ trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { ++ .btree = btree, ++ }; ++ ++ bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); ++ trans->nr_wb_updates++; ++ ++ return 0; ++} ++ ++int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, ++ enum btree_id btree, struct bpos end) ++{ ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); ++ k = bch2_btree_iter_prev(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ bch2_btree_iter_advance(iter); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ BUG_ON(k.k->type != KEY_TYPE_deleted); ++ ++ if (bkey_gt(k.k->p, end)) { ++ ret = -BCH_ERR_ENOSPC_btree_slot; ++ goto err; ++ } ++ ++ return 0; ++err: ++ bch2_trans_iter_exit(trans, iter); ++ return ret; ++} ++ ++void bch2_trans_commit_hook(struct btree_trans *trans, ++ struct btree_trans_commit_hook *h) ++{ ++ h->next = trans->hooks; ++ trans->hooks = h; ++} ++ ++int bch2_btree_insert_nonextent(struct btree_trans *trans, ++ enum btree_id btree, struct bkey_i *k, ++ enum btree_update_flags flags) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, btree, k->k.p, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, k, flags); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, ++ struct bkey_i *k, enum btree_update_flags flags) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_trans_update(trans, &iter, k, flags); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/** ++ * bch2_btree_insert - insert keys into the extent btree ++ * @c: pointer to struct bch_fs ++ * @id: btree to insert into ++ * 
@insert_keys: list of keys to insert ++ * @hook: insert callback ++ */ ++int bch2_btree_insert(struct bch_fs *c, enum btree_id id, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, int flags) ++{ ++ return bch2_trans_do(c, disk_res, journal_seq, flags, ++ __bch2_btree_insert(&trans, id, k, 0)); ++} ++ ++int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, ++ unsigned len, unsigned update_flags) ++{ ++ struct bkey_i *k; ++ ++ k = bch2_trans_kmalloc(trans, sizeof(*k)); ++ if (IS_ERR(k)) ++ return PTR_ERR(k); ++ ++ bkey_init(&k->k); ++ k->k.p = iter->pos; ++ bch2_key_resize(&k->k, len); ++ return bch2_trans_update(trans, iter, k, update_flags); ++} ++ ++int bch2_btree_delete_at(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned update_flags) ++{ ++ return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); ++} ++ ++int bch2_btree_delete_at_buffered(struct btree_trans *trans, ++ enum btree_id btree, struct bpos pos) ++{ ++ struct bkey_i *k; ++ ++ k = bch2_trans_kmalloc(trans, sizeof(*k)); ++ if (IS_ERR(k)) ++ return PTR_ERR(k); ++ ++ bkey_init(&k->k); ++ k->k.p = pos; ++ return bch2_trans_update_buffered(trans, btree, k); ++} ++ ++int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, ++ struct bpos start, struct bpos end, ++ unsigned update_flags, ++ u64 *journal_seq) ++{ ++ u32 restart_count = trans->restart_count; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); ++ while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(trans->c, 0); ++ struct bkey_i delete; ++ ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ bkey_init(&delete.k); ++ ++ /* ++ * This could probably be more efficient for extents: ++ */ ++ ++ /* ++ * For extents, iter.pos won't necessarily be the same as ++ * bkey_start_pos(k.k) (for non extents they always will be the ++ * same). It's important that we delete starting from iter.pos ++ * because the range we want to delete could start in the middle ++ * of k. ++ * ++ * (bch2_btree_iter_peek() does guarantee that iter.pos >= ++ * bkey_start_pos(k.k)). 
++ */ ++ delete.k.p = iter.pos; ++ ++ if (iter.flags & BTREE_ITER_IS_EXTENTS) ++ bch2_key_resize(&delete.k, ++ bpos_min(end, k.k->p).offset - ++ iter.pos.offset); ++ ++ ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: ++ bch2_trans_commit(trans, &disk_res, journal_seq, ++ BTREE_INSERT_NOFAIL); ++ bch2_disk_reservation_put(trans->c, &disk_res); ++err: ++ /* ++ * the bch2_trans_begin() call is in a weird place because we ++ * need to call it after every transaction commit, to avoid path ++ * overflow, but don't want to call it if the delete operation ++ * is a no-op and we have no work to do: ++ */ ++ bch2_trans_begin(trans); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ ret = 0; ++ if (ret) ++ break; ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (!ret && trans_was_restarted(trans, restart_count)) ++ ret = -BCH_ERR_transaction_restart_nested; ++ return ret; ++} ++ ++/* ++ * bch_btree_delete_range - delete everything within a given range ++ * ++ * Range is a half open interval - [start, end) ++ */ ++int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, ++ struct bpos start, struct bpos end, ++ unsigned update_flags, ++ u64 *journal_seq) ++{ ++ int ret = bch2_trans_run(c, ++ bch2_btree_delete_range_trans(&trans, id, start, end, ++ update_flags, journal_seq)); ++ if (ret == -BCH_ERR_transaction_restart_nested) ++ ret = 0; ++ return ret; ++} ++ ++int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, ++ struct bpos pos, bool set) ++{ ++ struct bkey_i *k; ++ int ret = 0; ++ ++ k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); ++ ret = PTR_ERR_OR_ZERO(k); ++ if (unlikely(ret)) ++ return ret; ++ ++ bkey_init(&k->k); ++ k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; ++ k->k.p = pos; ++ ++ return bch2_trans_update_buffered(trans, btree, k); ++} ++ ++static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) ++{ ++ struct printbuf buf = PRINTBUF; ++ struct jset_entry_log *l; ++ unsigned u64s; ++ int ret; ++ ++ prt_vprintf(&buf, fmt, args); ++ ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; ++ if (ret) ++ goto err; ++ ++ u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); ++ ++ ret = darray_make_room(entries, jset_u64s(u64s)); ++ if (ret) ++ goto err; ++ ++ l = (void *) &darray_top(*entries); ++ l->entry.u64s = cpu_to_le16(u64s); ++ l->entry.btree_id = 0; ++ l->entry.level = 1; ++ l->entry.type = BCH_JSET_ENTRY_log; ++ l->entry.pad[0] = 0; ++ l->entry.pad[1] = 0; ++ l->entry.pad[2] = 0; ++ memcpy(l->d, buf.buf, buf.pos); ++ while (buf.pos & 7) ++ l->d[buf.pos++] = '\0'; ++ ++ entries->nr += jset_u64s(u64s); ++err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++static int ++__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, ++ va_list args) ++{ ++ int ret; ++ ++ if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { ++ ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); ++ } else { ++ ret = bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_LAZY_RW|commit_flags, ++ __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); ++ } ++ ++ return ret; ++} ++ ++int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) ++{ ++ va_list args; ++ int ret; ++ ++ va_start(args, fmt); ++ ret = __bch2_fs_log_msg(c, 0, fmt, args); ++ va_end(args); ++ return ret; ++} ++ ++/* ++ * Use for logging messages during recovery to enable reserved space and avoid ++ * blocking. ++ */ ++int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
++{ ++ va_list args; ++ int ret; ++ ++ va_start(args, fmt); ++ ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); ++ va_end(args); ++ return ret; ++} diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h new file mode 100644 -index 000000000..d6aec9341 +index 000000000..901c42b57 --- /dev/null +++ b/fs/bcachefs/btree_update.h -@@ -0,0 +1,352 @@ +@@ -0,0 +1,353 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_H +#define _BCACHEFS_BTREE_UPDATE_H @@ -29965,8 +32375,9 @@ index 000000000..d6aec9341 + return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos); +} + -+int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, enum btree_update_flags); ++int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, ++ enum btree_update_flags, ++ struct bkey_s_c, struct bkey_s_c); + +int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos); @@ -30136,10 +32547,10 @@ index 000000000..d6aec9341 +{ + struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, + btree_id, pos, flags|BTREE_ITER_INTENT, type); -+ struct bkey_i *ret = unlikely(IS_ERR(k.k)) ++ struct bkey_i *ret = IS_ERR(k.k) + ? ERR_CAST(k.k) + : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); -+ if (unlikely(IS_ERR(ret))) ++ if (IS_ERR(ret)) + bch2_trans_iter_exit(trans, iter); + return ret; +} @@ -30221,7 +32632,7 @@ index 000000000..d6aec9341 +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 -index 000000000..3659b2c08 +index 000000000..c741150e6 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c @@ -0,0 +1,2488 @@ @@ -30232,6 +32643,7 @@ index 000000000..3659b2c08 +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_gc.h" ++#include "btree_journal_iter.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_io.h" @@ -30244,7 +32656,6 @@ index 000000000..3659b2c08 +#include "journal.h" +#include "journal_reclaim.h" +#include "keylist.h" -+#include "recovery.h" +#include "replicas.h" +#include "super-io.h" +#include "trace.h" @@ -30415,7 +32826,7 @@ index 000000000..3659b2c08 + bch2_btree_node_hash_remove(&c->btree_cache, b); + __btree_node_free(c, b); + six_unlock_write(&b->c.lock); -+ mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); ++ mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); + + trans_for_each_path(trans, path) + if (path->l[level].b == b) { @@ -30947,7 +33358,7 @@ index 000000000..3659b2c08 + + mutex_unlock(&c->btree_interior_update_lock); + -+ mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); ++ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); + six_unlock_write(&b->c.lock); + + btree_node_write_if_need(c, b, SIX_LOCK_intent); @@ -32612,7 +35023,7 @@ index 000000000..3659b2c08 + as, + as->mode, + as->nodes_written, -+ atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, ++ closure_nr_remaining(&as->cl), + as->journal.seq); + mutex_unlock(&c->btree_interior_update_lock); +} @@ -33056,2115 +35467,12 @@ index 000000000..5e0a467fe +int bch2_fs_btree_interior_update_init(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ -diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c -new file mode 100644 -index 000000000..368972a00 ---- /dev/null -+++ b/fs/bcachefs/btree_update_leaf.c -@@ -0,0 +1,2097 @@ -+// 
SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_gc.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "btree_write_buffer.h" -+#include "buckets.h" -+#include "debug.h" -+#include "errcode.h" -+#include "error.h" -+#include "extent_update.h" -+#include "journal.h" -+#include "journal_reclaim.h" -+#include "keylist.h" -+#include "recovery.h" -+#include "subvolume.h" -+#include "replicas.h" -+#include "trace.h" -+ -+#include -+#include -+ -+/* -+ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a -+ * different snapshot: -+ */ -+static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) -+{ -+ struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); -+ -+ if (k.k && bpos_eq(path->pos, k.k->p)) -+ return k; -+ -+ bkey_init(u); -+ u->p = path->pos; -+ return (struct bkey_s_c) { u, NULL }; -+} -+ -+static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bch_fs *c = trans->c; -+ struct bkey u; -+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); -+ -+ if (unlikely(trans->journal_replay_not_finished)) { -+ struct bkey_i *j_k = -+ bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); -+ -+ if (j_k) -+ k = bkey_i_to_s_c(j_k); -+ } -+ -+ u = *k.k; -+ u.needs_whiteout = i->old_k.needs_whiteout; -+ -+ BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); -+ BUG_ON(i->old_v != k.v); -+#endif -+} -+ -+static int __must_check -+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, -+ struct bkey_i *, enum btree_update_flags, -+ unsigned long ip); -+ -+static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, -+ const struct btree_insert_entry *r) -+{ -+ return cmp_int(l->btree_id, r->btree_id) ?: -+ cmp_int(l->cached, r->cached) ?: -+ -cmp_int(l->level, r->level) ?: -+ bpos_cmp(l->k->k.p, r->k->k.p); -+} -+ -+static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) -+{ -+ return i->path->l + i->level; -+} -+ -+static inline bool same_leaf_as_prev(struct btree_trans *trans, -+ struct btree_insert_entry *i) -+{ -+ return i != trans->updates && -+ insert_l(&i[0])->b == insert_l(&i[-1])->b; -+} -+ -+static inline bool same_leaf_as_next(struct btree_trans *trans, -+ struct btree_insert_entry *i) -+{ -+ return i + 1 < trans->updates + trans->nr_updates && -+ insert_l(&i[0])->b == insert_l(&i[1])->b; -+} -+ -+inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b) -+{ -+ struct bch_fs *c = trans->c; -+ -+ if (unlikely(btree_node_just_written(b)) && -+ bch2_btree_post_write_cleanup(c, b)) -+ bch2_trans_node_reinit_iter(trans, b); -+ -+ /* -+ * If the last bset has been written, or if it's gotten too big - start -+ * a new bset to insert into: -+ */ -+ if (want_new_bset(c, b)) -+ bch2_btree_init_next(trans, b); -+} -+ -+/* Inserting into a given leaf node (last stage of insert): */ -+ -+/* Handle overwrites and do insert, for non extents: */ -+bool bch2_btree_bset_insert_key(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bkey_i *insert) -+{ -+ struct bkey_packed *k; -+ unsigned clobber_u64s = 0, new_u64s = 0; -+ -+ EBUG_ON(btree_node_just_written(b)); -+ EBUG_ON(bset_written(b, 
btree_bset_last(b))); -+ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); -+ EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); -+ EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); -+ EBUG_ON(insert->k.u64s > -+ bch_btree_keys_u64s_remaining(trans->c, b)); -+ -+ k = bch2_btree_node_iter_peek_all(node_iter, b); -+ if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) -+ k = NULL; -+ -+ /* @k is the key being overwritten/deleted, if any: */ -+ EBUG_ON(k && bkey_deleted(k)); -+ -+ /* Deleting, but not found? nothing to do: */ -+ if (bkey_deleted(&insert->k) && !k) -+ return false; -+ -+ if (bkey_deleted(&insert->k)) { -+ /* Deleting: */ -+ btree_account_key_drop(b, k); -+ k->type = KEY_TYPE_deleted; -+ -+ if (k->needs_whiteout) -+ push_whiteout(trans->c, b, insert->k.p); -+ k->needs_whiteout = false; -+ -+ if (k >= btree_bset_last(b)->start) { -+ clobber_u64s = k->u64s; -+ bch2_bset_delete(b, k, clobber_u64s); -+ goto fix_iter; -+ } else { -+ bch2_btree_path_fix_key_modified(trans, b, k); -+ } -+ -+ return true; -+ } -+ -+ if (k) { -+ /* Overwriting: */ -+ btree_account_key_drop(b, k); -+ k->type = KEY_TYPE_deleted; -+ -+ insert->k.needs_whiteout = k->needs_whiteout; -+ k->needs_whiteout = false; -+ -+ if (k >= btree_bset_last(b)->start) { -+ clobber_u64s = k->u64s; -+ goto overwrite; -+ } else { -+ bch2_btree_path_fix_key_modified(trans, b, k); -+ } -+ } -+ -+ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); -+overwrite: -+ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); -+ new_u64s = k->u64s; -+fix_iter: -+ if (clobber_u64s != new_u64s) -+ bch2_btree_node_iter_fix(trans, path, b, node_iter, k, -+ clobber_u64s, new_u64s); -+ return true; -+} -+ -+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, -+ unsigned i, u64 seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct btree_write *w = container_of(pin, struct btree_write, journal); -+ struct btree *b = container_of(w, struct btree, writes[i]); -+ struct btree_trans trans; -+ unsigned long old, new, v; -+ unsigned idx = w - b->writes; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); -+ v = READ_ONCE(b->flags); -+ -+ do { -+ old = new = v; -+ -+ if (!(old & (1 << BTREE_NODE_dirty)) || -+ !!(old & (1 << BTREE_NODE_write_idx)) != idx || -+ w->journal.seq != seq) -+ break; -+ -+ new &= ~BTREE_WRITE_TYPE_MASK; -+ new |= BTREE_WRITE_journal_reclaim; -+ new |= 1 << BTREE_NODE_need_write; -+ } while ((v = cmpxchg(&b->flags, old, new)) != old); -+ -+ btree_node_write_if_need(c, b, SIX_LOCK_read); -+ six_unlock_read(&b->c.lock); -+ -+ bch2_trans_exit(&trans); -+ return 0; -+} -+ -+int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) -+{ -+ return __btree_node_flush(j, pin, 0, seq); -+} -+ -+int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) -+{ -+ return __btree_node_flush(j, pin, 1, seq); -+} -+ -+inline void bch2_btree_add_journal_pin(struct bch_fs *c, -+ struct btree *b, u64 seq) -+{ -+ struct btree_write *w = btree_current_write(b); -+ -+ bch2_journal_pin_add(&c->journal, seq, &w->journal, -+ btree_node_write_idx(b) == 0 -+ ? 
bch2_btree_node_flush0 -+ : bch2_btree_node_flush1); -+} -+ -+/** -+ * btree_insert_key - insert a key one key into a leaf node -+ */ -+inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, -+ struct btree_path *path, -+ struct bkey_i *insert, -+ u64 journal_seq) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = path_l(path)->b; -+ struct bset_tree *t = bset_tree_last(b); -+ struct bset *i = bset(b, t); -+ int old_u64s = bset_u64s(t); -+ int old_live_u64s = b->nr.live_u64s; -+ int live_u64s_added, u64s_added; -+ -+ if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, -+ &path_l(path)->iter, insert))) -+ return; -+ -+ i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); -+ -+ bch2_btree_add_journal_pin(c, b, journal_seq); -+ -+ if (unlikely(!btree_node_dirty(b))) { -+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); -+ set_btree_node_dirty_acct(c, b); -+ } -+ -+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; -+ u64s_added = (int) bset_u64s(t) - old_u64s; -+ -+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); -+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); -+ -+ if (u64s_added > live_u64s_added && -+ bch2_maybe_compact_whiteouts(c, b)) -+ bch2_trans_node_reinit_iter(trans, b); -+} -+ -+/* Cached btree updates: */ -+ -+/* Normal update interface: */ -+ -+static inline void btree_insert_entry_checks(struct btree_trans *trans, -+ struct btree_insert_entry *i) -+{ -+ BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); -+ BUG_ON(i->cached != i->path->cached); -+ BUG_ON(i->level != i->path->level); -+ BUG_ON(i->btree_id != i->path->btree_id); -+ EBUG_ON(!i->level && -+ !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && -+ test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && -+ i->k->k.p.snapshot && -+ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); -+} -+ -+static noinline int -+bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, -+ unsigned long trace_ip) -+{ -+ return drop_locks_do(trans, -+ bch2_journal_preres_get(&trans->c->journal, -+ &trans->journal_preres, -+ trans->journal_preres_u64s, -+ (flags & BCH_WATERMARK_MASK))); -+} -+ -+static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, -+ unsigned flags) -+{ -+ return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, -+ trans->journal_u64s, flags); -+} -+ -+#define JSET_ENTRY_LOG_U64s 4 -+ -+static noinline void journal_transaction_name(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct journal *j = &c->journal; -+ struct jset_entry *entry = -+ bch2_journal_add_entry(j, &trans->journal_res, -+ BCH_JSET_ENTRY_log, 0, 0, -+ JSET_ENTRY_LOG_U64s); -+ struct jset_entry_log *l = -+ container_of(entry, struct jset_entry_log, entry); -+ -+ strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); -+} -+ -+static inline int btree_key_can_insert(struct btree_trans *trans, -+ struct btree *b, unsigned u64s) -+{ -+ struct bch_fs *c = trans->c; -+ -+ if (!bch2_btree_node_insert_fits(c, b, u64s)) -+ return -BCH_ERR_btree_insert_btree_node_full; -+ -+ return 0; -+} -+ -+static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, -+ struct btree_path *path, unsigned u64s) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached *ck = (void *) path->l[0].b; -+ struct btree_insert_entry *i; -+ unsigned new_u64s; -+ struct bkey_i 
*new_k; -+ -+ EBUG_ON(path->level); -+ -+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && -+ bch2_btree_key_cache_must_wait(c) && -+ !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) -+ return -BCH_ERR_btree_insert_need_journal_reclaim; -+ -+ /* -+ * bch2_varint_decode can read past the end of the buffer by at most 7 -+ * bytes (it won't be used): -+ */ -+ u64s += 1; -+ -+ if (u64s <= ck->u64s) -+ return 0; -+ -+ new_u64s = roundup_pow_of_two(u64s); -+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); -+ if (!new_k) { -+ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", -+ bch2_btree_ids[path->btree_id], new_u64s); -+ return -BCH_ERR_ENOMEM_btree_key_cache_insert; -+ } -+ -+ trans_for_each_update(trans, i) -+ if (i->old_v == &ck->k->v) -+ i->old_v = &new_k->v; -+ -+ ck->u64s = new_u64s; -+ ck->k = new_k; -+ return 0; -+} -+ -+/* Triggers: */ -+ -+static int run_one_mem_trigger(struct btree_trans *trans, -+ struct btree_insert_entry *i, -+ unsigned flags) -+{ -+ struct bkey_s_c old = { &i->old_k, i->old_v }; -+ struct bkey_i *new = i->k; -+ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); -+ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); -+ int ret; -+ -+ verify_update_old_key(trans, i); -+ -+ if (unlikely(flags & BTREE_TRIGGER_NORUN)) -+ return 0; -+ -+ if (!btree_node_type_needs_gc(i->btree_id)) -+ return 0; -+ -+ if (old_ops->atomic_trigger == new_ops->atomic_trigger && -+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { -+ ret = bch2_mark_key(trans, i->btree_id, i->level, -+ old, bkey_i_to_s_c(new), -+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); -+ } else { -+ struct bkey _deleted = KEY(0, 0, 0); -+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; -+ -+ _deleted.p = i->path->pos; -+ -+ ret = bch2_mark_key(trans, i->btree_id, i->level, -+ deleted, bkey_i_to_s_c(new), -+ BTREE_TRIGGER_INSERT|flags) ?: -+ bch2_mark_key(trans, i->btree_id, i->level, -+ old, deleted, -+ BTREE_TRIGGER_OVERWRITE|flags); -+ } -+ -+ return ret; -+} -+ -+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, -+ bool overwrite) -+{ -+ /* -+ * Transactional triggers create new btree_insert_entries, so we can't -+ * pass them a pointer to a btree_insert_entry, that memory is going to -+ * move: -+ */ -+ struct bkey old_k = i->old_k; -+ struct bkey_s_c old = { &old_k, i->old_v }; -+ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); -+ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); -+ -+ verify_update_old_key(trans, i); -+ -+ if ((i->flags & BTREE_TRIGGER_NORUN) || -+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) -+ return 0; -+ -+ if (!i->insert_trigger_run && -+ !i->overwrite_trigger_run && -+ old_ops->trans_trigger == new_ops->trans_trigger && -+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { -+ i->overwrite_trigger_run = true; -+ i->insert_trigger_run = true; -+ return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, -+ BTREE_TRIGGER_INSERT| -+ BTREE_TRIGGER_OVERWRITE| -+ i->flags) ?: 1; -+ } else if (overwrite && !i->overwrite_trigger_run) { -+ i->overwrite_trigger_run = true; -+ return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; -+ } else if (!overwrite && !i->insert_trigger_run) { -+ i->insert_trigger_run = true; -+ return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; -+ } else { -+ return 0; -+ } -+} -+ -+static int run_btree_triggers(struct btree_trans 
*trans, enum btree_id btree_id, -+ struct btree_insert_entry *btree_id_start) -+{ -+ struct btree_insert_entry *i; -+ bool trans_trigger_run; -+ int ret, overwrite; -+ -+ for (overwrite = 1; overwrite >= 0; --overwrite) { -+ -+ /* -+ * Running triggers will append more updates to the list of updates as -+ * we're walking it: -+ */ -+ do { -+ trans_trigger_run = false; -+ -+ for (i = btree_id_start; -+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; -+ i++) { -+ if (i->btree_id != btree_id) -+ continue; -+ -+ ret = run_one_trans_trigger(trans, i, overwrite); -+ if (ret < 0) -+ return ret; -+ if (ret) -+ trans_trigger_run = true; -+ } -+ } while (trans_trigger_run); -+ } -+ -+ return 0; -+} -+ -+static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; -+ unsigned btree_id = 0; -+ int ret = 0; -+ -+ /* -+ * -+ * For a given btree, this algorithm runs insert triggers before -+ * overwrite triggers: this is so that when extents are being moved -+ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before -+ * they are re-added. -+ */ -+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { -+ if (btree_id == BTREE_ID_alloc) -+ continue; -+ -+ while (btree_id_start < trans->updates + trans->nr_updates && -+ btree_id_start->btree_id < btree_id) -+ btree_id_start++; -+ -+ ret = run_btree_triggers(trans, btree_id, btree_id_start); -+ if (ret) -+ return ret; -+ } -+ -+ trans_for_each_update(trans, i) { -+ if (i->btree_id > BTREE_ID_alloc) -+ break; -+ if (i->btree_id == BTREE_ID_alloc) { -+ ret = run_btree_triggers(trans, BTREE_ID_alloc, i); -+ if (ret) -+ return ret; -+ break; -+ } -+ } -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans_for_each_update(trans, i) -+ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && -+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && -+ (!i->insert_trigger_run || !i->overwrite_trigger_run)); -+#endif -+ return 0; -+} -+ -+static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ int ret = 0; -+ -+ trans_for_each_update(trans, i) { -+ /* -+ * XXX: synchronization of cached update triggers with gc -+ * XXX: synchronization of interior node updates with gc -+ */ -+ BUG_ON(i->cached || i->level); -+ -+ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { -+ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); -+ if (ret) -+ break; -+ } -+ } -+ -+ return ret; -+} -+ -+static inline int -+bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, -+ struct btree_insert_entry **stopped_at, -+ unsigned long trace_ip) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ struct btree_write_buffered_key *wb; -+ struct btree_trans_commit_hook *h; -+ unsigned u64s = 0; -+ bool marking = false; -+ int ret; -+ -+ if (race_fault()) { -+ trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); -+ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); -+ } -+ -+ /* -+ * Check if the insert will fit in the leaf node with the write lock -+ * held, otherwise another thread could write the node changing the -+ * amount of space available: -+ */ -+ -+ prefetch(&trans->c->journal.flags); -+ -+ trans_for_each_update(trans, i) { -+ /* Multiple inserts might go to same leaf: */ -+ if (!same_leaf_as_prev(trans, i)) -+ u64s = 0; -+ -+ u64s += i->k->k.u64s; -+ ret = !i->cached -+ ? 
btree_key_can_insert(trans, insert_l(i)->b, u64s) -+ : btree_key_can_insert_cached(trans, flags, i->path, u64s); -+ if (ret) { -+ *stopped_at = i; -+ return ret; -+ } -+ -+ if (btree_node_type_needs_gc(i->bkey_type)) -+ marking = true; -+ } -+ -+ if (trans->nr_wb_updates && -+ trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) -+ return -BCH_ERR_btree_insert_need_flush_buffer; -+ -+ /* -+ * Don't get journal reservation until after we know insert will -+ * succeed: -+ */ -+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ ret = bch2_trans_journal_res_get(trans, -+ (flags & BCH_WATERMARK_MASK)| -+ JOURNAL_RES_GET_NONBLOCK); -+ if (ret) -+ return ret; -+ -+ if (unlikely(trans->journal_transaction_names)) -+ journal_transaction_name(trans); -+ } else { -+ trans->journal_res.seq = c->journal.replay_journal_seq; -+ } -+ -+ /* -+ * Not allowed to fail after we've gotten our journal reservation - we -+ * have to use it: -+ */ -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && -+ !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { -+ if (bch2_journal_seq_verify) -+ trans_for_each_update(trans, i) -+ i->k->k.version.lo = trans->journal_res.seq; -+ else if (bch2_inject_invalid_keys) -+ trans_for_each_update(trans, i) -+ i->k->k.version = MAX_VERSION; -+ } -+ -+ if (trans->fs_usage_deltas && -+ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) -+ return -BCH_ERR_btree_insert_need_mark_replicas; -+ -+ if (trans->nr_wb_updates) { -+ EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); -+ -+ ret = bch2_btree_insert_keys_write_buffer(trans); -+ if (ret) -+ goto revert_fs_usage; -+ } -+ -+ h = trans->hooks; -+ while (h) { -+ ret = h->fn(trans, h); -+ if (ret) -+ goto revert_fs_usage; -+ h = h->next; -+ } -+ -+ trans_for_each_update(trans, i) -+ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { -+ ret = run_one_mem_trigger(trans, i, i->flags); -+ if (ret) -+ goto fatal_err; -+ } -+ -+ if (unlikely(c->gc_pos.phase)) { -+ ret = bch2_trans_commit_run_gc_triggers(trans); -+ if (ret) -+ goto fatal_err; -+ } -+ -+ if (unlikely(trans->extra_journal_entries.nr)) { -+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), -+ trans->extra_journal_entries.data, -+ trans->extra_journal_entries.nr); -+ -+ trans->journal_res.offset += trans->extra_journal_entries.nr; -+ trans->journal_res.u64s -= trans->extra_journal_entries.nr; -+ } -+ -+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ struct journal *j = &c->journal; -+ struct jset_entry *entry; -+ -+ trans_for_each_update(trans, i) { -+ if (i->key_cache_already_flushed) -+ continue; -+ -+ if (i->flags & BTREE_UPDATE_NOJOURNAL) -+ continue; -+ -+ verify_update_old_key(trans, i); -+ -+ if (trans->journal_transaction_names) { -+ entry = bch2_journal_add_entry(j, &trans->journal_res, -+ BCH_JSET_ENTRY_overwrite, -+ i->btree_id, i->level, -+ i->old_k.u64s); -+ bkey_reassemble(&entry->start[0], -+ (struct bkey_s_c) { &i->old_k, i->old_v }); -+ } -+ -+ entry = bch2_journal_add_entry(j, &trans->journal_res, -+ BCH_JSET_ENTRY_btree_keys, -+ i->btree_id, i->level, -+ i->k->k.u64s); -+ bkey_copy(&entry->start[0], i->k); -+ } -+ -+ trans_for_each_wb_update(trans, wb) { -+ entry = bch2_journal_add_entry(j, &trans->journal_res, -+ BCH_JSET_ENTRY_btree_keys, -+ wb->btree, 0, -+ wb->k.k.u64s); -+ bkey_copy(&entry->start[0], &wb->k); -+ } -+ -+ if (trans->journal_seq) -+ *trans->journal_seq = trans->journal_res.seq; -+ } -+ -+ trans_for_each_update(trans, i) { -+ i->k->k.needs_whiteout = false; -+ -+ if (!i->cached) { 
-+ u64 seq = trans->journal_res.seq; -+ -+ if (i->flags & BTREE_UPDATE_PREJOURNAL) -+ seq = i->seq; -+ -+ bch2_btree_insert_key_leaf(trans, i->path, i->k, seq); -+ } else if (!i->key_cache_already_flushed) -+ bch2_btree_insert_key_cached(trans, flags, i); -+ else { -+ bch2_btree_key_cache_drop(trans, i->path); -+ btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); -+ } -+ } -+ -+ return 0; -+fatal_err: -+ bch2_fatal_error(c); -+revert_fs_usage: -+ if (trans->fs_usage_deltas) -+ bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); -+ return ret; -+} -+ -+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -+{ -+ while (--i >= trans->updates) { -+ if (same_leaf_as_prev(trans, i)) -+ continue; -+ -+ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); -+ } -+ -+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -+} -+ -+static inline int trans_lock_write(struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update(trans, i) { -+ if (same_leaf_as_prev(trans, i)) -+ continue; -+ -+ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) -+ return trans_lock_write_fail(trans, i); -+ -+ if (!i->cached) -+ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); -+ } -+ -+ return 0; -+} -+ -+static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i; -+ struct btree_write_buffered_key *wb; -+ -+ trans_for_each_update(trans, i) -+ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); -+ -+ trans_for_each_wb_update(trans, wb) -+ bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, -+ struct btree_insert_entry *i, -+ struct printbuf *err) -+{ -+ struct bch_fs *c = trans->c; -+ int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? 
READ : WRITE; -+ -+ printbuf_reset(err); -+ prt_printf(err, "invalid bkey on insert from %s -> %ps", -+ trans->fn, (void *) i->ip_allocated); -+ prt_newline(err); -+ printbuf_indent_add(err, 2); -+ -+ bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); -+ prt_newline(err); -+ -+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), -+ i->bkey_type, rw, err); -+ bch2_print_string_as_lines(KERN_ERR, err->buf); -+ -+ bch2_inconsistent_error(c); -+ bch2_dump_trans_updates(trans); -+ printbuf_exit(err); -+ -+ return -EINVAL; -+} -+#endif -+ -+/* -+ * Get journal reservation, take write locks, and attempt to do btree update(s): -+ */ -+static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, -+ struct btree_insert_entry **stopped_at, -+ unsigned long trace_ip) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ int ret = 0, u64s_delta = 0; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans_for_each_update(trans, i) { -+ struct printbuf buf = PRINTBUF; -+ enum bkey_invalid_flags invalid_flags = 0; -+ -+ if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) -+ invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; -+ -+ if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), -+ i->bkey_type, invalid_flags, &buf))) -+ ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); -+ btree_insert_entry_checks(trans, i); -+ printbuf_exit(&buf); -+ -+ if (ret) -+ return ret; -+ } -+#endif -+ -+ trans_for_each_update(trans, i) { -+ if (i->cached) -+ continue; -+ -+ u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; -+ u64s_delta -= i->old_btree_u64s; -+ -+ if (!same_leaf_as_next(trans, i)) { -+ if (u64s_delta <= 0) { -+ ret = bch2_foreground_maybe_merge(trans, i->path, -+ i->level, flags); -+ if (unlikely(ret)) -+ return ret; -+ } -+ -+ u64s_delta = 0; -+ } -+ } -+ -+ ret = bch2_journal_preres_get(&c->journal, -+ &trans->journal_preres, trans->journal_preres_u64s, -+ (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); -+ if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) -+ ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); -+ if (unlikely(ret)) -+ return ret; -+ -+ ret = trans_lock_write(trans); -+ if (unlikely(ret)) -+ return ret; -+ -+ ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); -+ -+ if (!ret && unlikely(trans->journal_replay_not_finished)) -+ bch2_drop_overwrites_from_journal(trans); -+ -+ trans_for_each_update(trans, i) -+ if (!same_leaf_as_prev(trans, i)) -+ bch2_btree_node_unlock_write_inlined(trans, i->path, -+ insert_l(i)->b); -+ -+ if (!ret && trans->journal_pin) -+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, -+ trans->journal_pin, NULL); -+ -+ /* -+ * Drop journal reservation after dropping write locks, since dropping -+ * the journal reservation may kick off a journal write: -+ */ -+ bch2_journal_res_put(&c->journal, &trans->journal_res); -+ -+ if (unlikely(ret)) -+ return ret; -+ -+ bch2_trans_downgrade(trans); -+ -+ return 0; -+} -+ -+static int journal_reclaim_wait_done(struct bch_fs *c) -+{ -+ int ret = bch2_journal_error(&c->journal) ?: -+ !bch2_btree_key_cache_must_wait(c); -+ -+ if (!ret) -+ journal_reclaim_kick(&c->journal); -+ return ret; -+} -+ -+static noinline -+int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, -+ struct btree_insert_entry *i, -+ int ret, unsigned long trace_ip) -+{ -+ struct bch_fs *c = trans->c; -+ -+ switch (ret) { -+ case -BCH_ERR_btree_insert_btree_node_full: -+ ret = bch2_btree_split_leaf(trans, i->path, flags); -+ if (bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) -+ trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); -+ break; -+ case -BCH_ERR_btree_insert_need_mark_replicas: -+ ret = drop_locks_do(trans, -+ bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); -+ break; -+ case -BCH_ERR_journal_res_get_blocked: -+ /* -+ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK -+ * flag -+ */ -+ if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && -+ (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { -+ ret = -BCH_ERR_journal_reclaim_would_deadlock; -+ break; -+ } -+ -+ ret = drop_locks_do(trans, -+ bch2_trans_journal_res_get(trans, -+ (flags & BCH_WATERMARK_MASK)| -+ JOURNAL_RES_GET_CHECK)); -+ break; -+ case -BCH_ERR_btree_insert_need_journal_reclaim: -+ bch2_trans_unlock(trans); -+ -+ trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); -+ -+ wait_event_freezable(c->journal.reclaim_wait, -+ (ret = journal_reclaim_wait_done(c))); -+ if (ret < 0) -+ break; -+ -+ ret = bch2_trans_relock(trans); -+ break; -+ case -BCH_ERR_btree_insert_need_flush_buffer: { -+ struct btree_write_buffer *wb = &c->btree_write_buffer; -+ -+ ret = 0; -+ -+ if (wb->state.nr > wb->size * 3 / 4) { -+ bch2_trans_unlock(trans); -+ mutex_lock(&wb->flush_lock); -+ -+ if (wb->state.nr > wb->size * 3 / 4) { -+ bch2_trans_begin(trans); -+ ret = __bch2_btree_write_buffer_flush(trans, -+ flags|BTREE_INSERT_NOCHECK_RW, true); -+ if (!ret) { -+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); -+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); -+ } -+ } else { -+ mutex_unlock(&wb->flush_lock); -+ ret = bch2_trans_relock(trans); -+ } -+ } -+ break; -+ } -+ default: -+ BUG_ON(ret >= 0); -+ break; -+ } -+ -+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); -+ -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && -+ !(flags & BTREE_INSERT_NOWAIT) && -+ (flags & BTREE_INSERT_NOFAIL), c, -+ "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); -+ -+ return ret; -+} -+ -+static noinline int -+bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || -+ test_bit(BCH_FS_STARTED, &c->flags)) -+ return -BCH_ERR_erofs_trans_commit; -+ -+ ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); -+ if (ret) -+ return ret; -+ -+ bch2_write_ref_get(c, BCH_WRITE_REF_trans); -+ return 0; -+} -+ -+/* -+ * This is for updates done in the early part of fsck - btree_gc - before we've -+ * gone RW. we only add the new key to the list of keys for journal replay to -+ * do. 
-+ */ -+static noinline int -+do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ int ret = 0; -+ -+ trans_for_each_update(trans, i) { -+ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); -+ if (ret) -+ break; -+ } -+ -+ return ret; -+} -+ -+int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i = NULL; -+ struct btree_write_buffered_key *wb; -+ unsigned u64s; -+ int ret = 0; -+ -+ if (!trans->nr_updates && -+ !trans->nr_wb_updates && -+ !trans->extra_journal_entries.nr) -+ goto out_reset; -+ -+ if (flags & BTREE_INSERT_GC_LOCK_HELD) -+ lockdep_assert_held(&c->gc_lock); -+ -+ ret = bch2_trans_commit_run_triggers(trans); -+ if (ret) -+ goto out_reset; -+ -+ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { -+ ret = do_bch2_trans_commit_to_journal_replay(trans); -+ goto out_reset; -+ } -+ -+ if (!(flags & BTREE_INSERT_NOCHECK_RW) && -+ unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { -+ ret = bch2_trans_commit_get_rw_cold(trans, flags); -+ if (ret) -+ goto out_reset; -+ } -+ -+ if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && -+ mutex_trylock(&c->btree_write_buffer.flush_lock)) { -+ bch2_trans_begin(trans); -+ bch2_trans_unlock(trans); -+ -+ ret = __bch2_btree_write_buffer_flush(trans, -+ flags|BTREE_INSERT_NOCHECK_RW, true); -+ if (!ret) { -+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); -+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); -+ } -+ goto out; -+ } -+ -+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); -+ -+ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); -+ -+ trans->journal_u64s = trans->extra_journal_entries.nr; -+ trans->journal_preres_u64s = 0; -+ -+ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); -+ -+ if (trans->journal_transaction_names) -+ trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); -+ -+ trans_for_each_update(trans, i) { -+ EBUG_ON(!i->path->should_be_locked); -+ -+ ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); -+ if (unlikely(ret)) -+ goto out; -+ -+ EBUG_ON(!btree_node_intent_locked(i->path, i->level)); -+ -+ if (i->key_cache_already_flushed) -+ continue; -+ -+ /* we're going to journal the key being updated: */ -+ u64s = jset_u64s(i->k->k.u64s); -+ if (i->cached && -+ likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) -+ trans->journal_preres_u64s += u64s; -+ -+ if (i->flags & BTREE_UPDATE_NOJOURNAL) -+ continue; -+ -+ trans->journal_u64s += u64s; -+ -+ /* and we're also going to log the overwrite: */ -+ if (trans->journal_transaction_names) -+ trans->journal_u64s += jset_u64s(i->old_k.u64s); -+ } -+ -+ trans_for_each_wb_update(trans, wb) -+ trans->journal_u64s += jset_u64s(wb->k.k.u64s); -+ -+ if (trans->extra_journal_res) { -+ ret = bch2_disk_reservation_add(c, trans->disk_res, -+ trans->extra_journal_res, -+ (flags & BTREE_INSERT_NOFAIL) -+ ? 
BCH_DISK_RESERVATION_NOFAIL : 0); -+ if (ret) -+ goto err; -+ } -+retry: -+ bch2_trans_verify_not_in_restart(trans); -+ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); -+ -+ ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); -+ -+ /* make sure we didn't drop or screw up locks: */ -+ bch2_trans_verify_locks(trans); -+ -+ if (ret) -+ goto err; -+ -+ trace_and_count(c, transaction_commit, trans, _RET_IP_); -+out: -+ bch2_journal_preres_put(&c->journal, &trans->journal_preres); -+ -+ if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) -+ bch2_write_ref_put(c, BCH_WRITE_REF_trans); -+out_reset: -+ bch2_trans_reset_updates(trans); -+ -+ return ret; -+err: -+ ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); -+ if (ret) -+ goto out; -+ -+ goto retry; -+} -+ -+static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, -+ enum btree_id id, -+ struct bpos pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, id, pos, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ while (1) { -+ k = bch2_btree_iter_prev(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ -+ if (!k.k) -+ break; -+ -+ if (!bkey_eq(pos, k.k->p)) -+ break; -+ -+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { -+ ret = 1; -+ break; -+ } -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+} -+ -+static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, -+ enum btree_id id, -+ struct bpos pos) -+{ -+ if (!btree_type_has_snapshots(id) || -+ bch2_snapshot_is_leaf(trans->c, pos.snapshot)) -+ return 0; -+ -+ return __check_pos_snapshot_overwritten(trans, id, pos); -+} -+ -+static noinline int extent_front_merge(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ struct bkey_i **insert, -+ enum btree_update_flags flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_i *update; -+ int ret; -+ -+ update = bch2_bkey_make_mut_noupdate(trans, k); -+ ret = PTR_ERR_OR_ZERO(update); -+ if (ret) -+ return ret; -+ -+ if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) -+ return 0; -+ -+ ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?: -+ check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p); -+ if (ret < 0) -+ return ret; -+ if (ret) -+ return 0; -+ -+ ret = bch2_btree_delete_at(trans, iter, flags); -+ if (ret) -+ return ret; -+ -+ *insert = update; -+ return 0; -+} -+ -+static noinline int extent_back_merge(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?: -+ check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p); -+ if (ret < 0) -+ return ret; -+ if (ret) -+ return 0; -+ -+ bch2_bkey_merge(c, bkey_i_to_s(insert), k); -+ return 0; -+} -+ -+/* -+ * When deleting, check if we need to emit a whiteout (because we're overwriting -+ * something in an ancestor snapshot) -+ */ -+static int need_whiteout_for_snapshot(struct btree_trans *trans, -+ enum btree_id btree_id, struct bpos pos) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ u32 snapshot = pos.snapshot; -+ int ret; -+ -+ if (!bch2_snapshot_parent(trans->c, pos.snapshot)) -+ return 0; -+ -+ pos.snapshot++; -+ -+ for_each_btree_key_norestart(trans, iter, btree_id, pos, -+ BTREE_ITER_ALL_SNAPSHOTS| -+ BTREE_ITER_NOPRESERVE, k, 
ret) { -+ if (!bkey_eq(k.k->p, pos)) -+ break; -+ -+ if (bch2_snapshot_is_ancestor(trans->c, snapshot, -+ k.k->p.snapshot)) { -+ ret = !bkey_whiteout(k.k); -+ break; -+ } -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+} -+ -+int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, -+ enum btree_id id, -+ struct bpos old_pos, -+ struct bpos new_pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter old_iter, new_iter = { NULL }; -+ struct bkey_s_c old_k, new_k; -+ snapshot_id_list s; -+ struct bkey_i *update; -+ int ret; -+ -+ if (!bch2_snapshot_has_children(c, old_pos.snapshot)) -+ return 0; -+ -+ darray_init(&s); -+ -+ bch2_trans_iter_init(trans, &old_iter, id, old_pos, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ while ((old_k = bch2_btree_iter_prev(&old_iter)).k && -+ !(ret = bkey_err(old_k)) && -+ bkey_eq(old_pos, old_k.k->p)) { -+ struct bpos whiteout_pos = -+ SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; -+ -+ if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || -+ snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) -+ continue; -+ -+ new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_INTENT); -+ ret = bkey_err(new_k); -+ if (ret) -+ break; -+ -+ if (new_k.k->type == KEY_TYPE_deleted) { -+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); -+ ret = PTR_ERR_OR_ZERO(update); -+ if (ret) -+ break; -+ -+ bkey_init(&update->k); -+ update->k.p = whiteout_pos; -+ update->k.type = KEY_TYPE_whiteout; -+ -+ ret = bch2_trans_update(trans, &new_iter, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ } -+ bch2_trans_iter_exit(trans, &new_iter); -+ -+ ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(trans, &new_iter); -+ bch2_trans_iter_exit(trans, &old_iter); -+ darray_exit(&s); -+ -+ return ret; -+} -+ -+int bch2_trans_update_extent(struct btree_trans *trans, -+ struct btree_iter *orig_iter, -+ struct bkey_i *insert, -+ enum btree_update_flags flags) -+{ -+ struct btree_iter iter; -+ struct bpos start = bkey_start_pos(&insert->k); -+ struct bkey_i *update; -+ struct bkey_s_c k; -+ enum btree_id btree_id = orig_iter->btree_id; -+ int ret = 0, compressed_sectors; -+ -+ bch2_trans_iter_init(trans, &iter, btree_id, start, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_WITH_UPDATES| -+ BTREE_ITER_NOT_EXTENTS); -+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); -+ if ((ret = bkey_err(k))) -+ goto err; -+ if (!k.k) -+ goto out; -+ -+ if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { -+ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { -+ ret = extent_front_merge(trans, &iter, k, &insert, flags); -+ if (ret) -+ goto err; -+ } -+ -+ goto next; -+ } -+ -+ while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { -+ bool front_split = bkey_lt(bkey_start_pos(k.k), start); -+ bool back_split = bkey_gt(k.k->p, insert->k.p); -+ -+ /* -+ * If we're going to be splitting a compressed extent, note it -+ * so that __bch2_trans_commit() can increase our disk -+ * reservation: -+ */ -+ if (((front_split && back_split) || -+ ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) && -+ (compressed_sectors = bch2_bkey_sectors_compressed(k))) -+ trans->extra_journal_res += compressed_sectors; -+ -+ if (front_split) { -+ update = bch2_bkey_make_mut_noupdate(trans, k); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bch2_cut_back(start, update); -+ -+ ret = 
bch2_insert_snapshot_whiteouts(trans, btree_id, -+ k.k->p, update->k.p) ?: -+ bch2_btree_insert_nonextent(trans, btree_id, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); -+ if (ret) -+ goto err; -+ } -+ -+ if (k.k->p.snapshot != insert->k.p.snapshot && -+ (front_split || back_split)) { -+ update = bch2_bkey_make_mut_noupdate(trans, k); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bch2_cut_front(start, update); -+ bch2_cut_back(insert->k.p, update); -+ -+ ret = bch2_insert_snapshot_whiteouts(trans, btree_id, -+ k.k->p, update->k.p) ?: -+ bch2_btree_insert_nonextent(trans, btree_id, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); -+ if (ret) -+ goto err; -+ } -+ -+ if (bkey_le(k.k->p, insert->k.p)) { -+ update = bch2_trans_kmalloc(trans, sizeof(*update)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bkey_init(&update->k); -+ update->k.p = k.k->p; -+ update->k.p.snapshot = insert->k.p.snapshot; -+ -+ if (insert->k.p.snapshot != k.k->p.snapshot) { -+ update->k.type = KEY_TYPE_whiteout; -+ } else if (btree_type_has_snapshots(btree_id)) { -+ ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); -+ if (ret < 0) -+ goto err; -+ if (ret) -+ update->k.type = KEY_TYPE_whiteout; -+ } -+ -+ ret = bch2_btree_insert_nonextent(trans, btree_id, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); -+ if (ret) -+ goto err; -+ } -+ -+ if (back_split) { -+ update = bch2_bkey_make_mut_noupdate(trans, k); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bch2_cut_front(insert->k.p, update); -+ -+ ret = bch2_trans_update_by_path(trans, iter.path, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| -+ flags, _RET_IP_); -+ if (ret) -+ goto err; -+ goto out; -+ } -+next: -+ bch2_btree_iter_advance(&iter); -+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); -+ if ((ret = bkey_err(k))) -+ goto err; -+ if (!k.k) -+ goto out; -+ } -+ -+ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { -+ ret = extent_back_merge(trans, &iter, insert, k); -+ if (ret) -+ goto err; -+ } -+out: -+ if (!bkey_deleted(&insert->k)) { -+ /* -+ * Rewinding iterators is expensive: get a new one and the one -+ * that points to the start of insert will be cloned from: -+ */ -+ bch2_trans_iter_exit(trans, &iter); -+ bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_INTENT); -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(trans, &iter, insert, flags); -+ } -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+} -+ -+static noinline int flush_new_cached_update(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree_insert_entry *i, -+ enum btree_update_flags flags, -+ unsigned long ip) -+{ -+ struct btree_path *btree_path; -+ struct bkey k; -+ int ret; -+ -+ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, -+ BTREE_ITER_INTENT, _THIS_IP_); -+ ret = bch2_btree_path_traverse(trans, btree_path, 0); -+ if (ret) -+ goto out; -+ -+ /* -+ * The old key in the insert entry might actually refer to an existing -+ * key in the btree that has been deleted from cache and not yet -+ * flushed. Check for this and skip the flush so we don't run triggers -+ * against a stale key. 
-+ */ -+ bch2_btree_path_peek_slot_exact(btree_path, &k); -+ if (!bkey_deleted(&k)) -+ goto out; -+ -+ i->key_cache_already_flushed = true; -+ i->flags |= BTREE_TRIGGER_NORUN; -+ -+ btree_path_set_should_be_locked(btree_path); -+ ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); -+out: -+ bch2_path_put(trans, btree_path, true); -+ return ret; -+} -+ -+static int __must_check -+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, -+ struct bkey_i *k, enum btree_update_flags flags, -+ unsigned long ip) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i, n; -+ u64 seq = 0; -+ int cmp; -+ -+ EBUG_ON(!path->should_be_locked); -+ EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); -+ EBUG_ON(!bpos_eq(k->k.p, path->pos)); -+ -+ /* -+ * The transaction journal res hasn't been allocated at this point. -+ * That occurs at commit time. Reuse the seq field to pass in the seq -+ * of a prejournaled key. -+ */ -+ if (flags & BTREE_UPDATE_PREJOURNAL) -+ seq = trans->journal_res.seq; -+ -+ n = (struct btree_insert_entry) { -+ .flags = flags, -+ .bkey_type = __btree_node_type(path->level, path->btree_id), -+ .btree_id = path->btree_id, -+ .level = path->level, -+ .cached = path->cached, -+ .path = path, -+ .k = k, -+ .seq = seq, -+ .ip_allocated = ip, -+ }; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans_for_each_update(trans, i) -+ BUG_ON(i != trans->updates && -+ btree_insert_entry_cmp(i - 1, i) >= 0); -+#endif -+ -+ /* -+ * Pending updates are kept sorted: first, find position of new update, -+ * then delete/trim any updates the new update overwrites: -+ */ -+ trans_for_each_update(trans, i) { -+ cmp = btree_insert_entry_cmp(&n, i); -+ if (cmp <= 0) -+ break; -+ } -+ -+ if (!cmp && i < trans->updates + trans->nr_updates) { -+ EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); -+ -+ bch2_path_put(trans, i->path, true); -+ i->flags = n.flags; -+ i->cached = n.cached; -+ i->k = n.k; -+ i->path = n.path; -+ i->seq = n.seq; -+ i->ip_allocated = n.ip_allocated; -+ } else { -+ array_insert_item(trans->updates, trans->nr_updates, -+ i - trans->updates, n); -+ -+ i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; -+ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; -+ -+ if (unlikely(trans->journal_replay_not_finished)) { -+ struct bkey_i *j_k = -+ bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); -+ -+ if (j_k) { -+ i->old_k = j_k->k; -+ i->old_v = &j_k->v; -+ } -+ } -+ } -+ -+ __btree_path_get(i->path, true); -+ -+ /* -+ * If a key is present in the key cache, it must also exist in the -+ * btree - this is necessary for cache coherency. 
When iterating over -+ * a btree that's cached in the key cache, the btree iter code checks -+ * the key cache - but the key has to exist in the btree for that to -+ * work: -+ */ -+ if (path->cached && bkey_deleted(&i->old_k)) -+ return flush_new_cached_update(trans, path, i, flags, ip); -+ -+ return 0; -+} -+ -+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_i *k, enum btree_update_flags flags) -+{ -+ struct btree_path *path = iter->update_path ?: iter->path; -+ struct bkey_cached *ck; -+ int ret; -+ -+ if (iter->flags & BTREE_ITER_IS_EXTENTS) -+ return bch2_trans_update_extent(trans, iter, k, flags); -+ -+ if (bkey_deleted(&k->k) && -+ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && -+ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { -+ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); -+ if (unlikely(ret < 0)) -+ return ret; -+ -+ if (ret) -+ k->k.type = KEY_TYPE_whiteout; -+ } -+ -+ /* -+ * Ensure that updates to cached btrees go to the key cache: -+ */ -+ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && -+ !path->cached && -+ !path->level && -+ btree_id_cached(trans->c, path->btree_id)) { -+ if (!iter->key_cache_path || -+ !iter->key_cache_path->should_be_locked || -+ !bpos_eq(iter->key_cache_path->pos, k->k.p)) { -+ if (!iter->key_cache_path) -+ iter->key_cache_path = -+ bch2_path_get(trans, path->btree_id, path->pos, 1, 0, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_CACHED, _THIS_IP_); -+ -+ iter->key_cache_path = -+ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, -+ iter->flags & BTREE_ITER_INTENT, -+ _THIS_IP_); -+ -+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, -+ BTREE_ITER_CACHED); -+ if (unlikely(ret)) -+ return ret; -+ -+ ck = (void *) iter->key_cache_path->l[0].b; -+ -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); -+ } -+ -+ btree_path_set_should_be_locked(iter->key_cache_path); -+ } -+ -+ path = iter->key_cache_path; -+ } -+ -+ return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); -+} -+ -+/* -+ * Add a transaction update for a key that has already been journaled. 
-+ */ -+int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, -+ struct btree_iter *iter, struct bkey_i *k, -+ enum btree_update_flags flags) -+{ -+ trans->journal_res.seq = seq; -+ return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| -+ BTREE_UPDATE_PREJOURNAL); -+} -+ -+int __must_check bch2_trans_update_buffered(struct btree_trans *trans, -+ enum btree_id btree, -+ struct bkey_i *k) -+{ -+ struct btree_write_buffered_key *i; -+ int ret; -+ -+ EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); -+ EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); -+ -+ trans_for_each_wb_update(trans, i) { -+ if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { -+ bkey_copy(&i->k, k); -+ return 0; -+ } -+ } -+ -+ if (!trans->wb_updates || -+ trans->nr_wb_updates == trans->wb_updates_size) { -+ struct btree_write_buffered_key *u; -+ -+ if (trans->nr_wb_updates == trans->wb_updates_size) { -+ struct btree_transaction_stats *s = btree_trans_stats(trans); -+ -+ BUG_ON(trans->wb_updates_size > U8_MAX / 2); -+ trans->wb_updates_size = max(1, trans->wb_updates_size * 2); -+ if (s) -+ s->wb_updates_size = trans->wb_updates_size; -+ } -+ -+ u = bch2_trans_kmalloc_nomemzero(trans, -+ trans->wb_updates_size * -+ sizeof(struct btree_write_buffered_key)); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ return ret; -+ -+ if (trans->nr_wb_updates) -+ memcpy(u, trans->wb_updates, trans->nr_wb_updates * -+ sizeof(struct btree_write_buffered_key)); -+ trans->wb_updates = u; -+ } -+ -+ trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { -+ .btree = btree, -+ }; -+ -+ bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); -+ trans->nr_wb_updates++; -+ -+ return 0; -+} -+ -+int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, -+ enum btree_id btree, struct bpos end) -+{ -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); -+ k = bch2_btree_iter_prev(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ bch2_btree_iter_advance(iter); -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ BUG_ON(k.k->type != KEY_TYPE_deleted); -+ -+ if (bkey_gt(k.k->p, end)) { -+ ret = -BCH_ERR_ENOSPC_btree_slot; -+ goto err; -+ } -+ -+ return 0; -+err: -+ bch2_trans_iter_exit(trans, iter); -+ return ret; -+} -+ -+void bch2_trans_commit_hook(struct btree_trans *trans, -+ struct btree_trans_commit_hook *h) -+{ -+ h->next = trans->hooks; -+ trans->hooks = h; -+} -+ -+int bch2_btree_insert_nonextent(struct btree_trans *trans, -+ enum btree_id btree, struct bkey_i *k, -+ enum btree_update_flags flags) -+{ -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, btree, k->k.p, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_INTENT); -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(trans, &iter, k, flags); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, -+ struct bkey_i *k, enum btree_update_flags flags) -+{ -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), -+ BTREE_ITER_CACHED| -+ BTREE_ITER_INTENT); -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(trans, &iter, k, flags); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+/** -+ * bch2_btree_insert - insert keys into the extent btree -+ * @c: pointer to struct bch_fs -+ * @id: btree to insert into -+ * 
@insert_keys: list of keys to insert -+ * @hook: insert callback -+ */ -+int bch2_btree_insert(struct bch_fs *c, enum btree_id id, -+ struct bkey_i *k, -+ struct disk_reservation *disk_res, -+ u64 *journal_seq, int flags) -+{ -+ return bch2_trans_do(c, disk_res, journal_seq, flags, -+ __bch2_btree_insert(&trans, id, k, 0)); -+} -+ -+int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, -+ unsigned len, unsigned update_flags) -+{ -+ struct bkey_i *k; -+ -+ k = bch2_trans_kmalloc(trans, sizeof(*k)); -+ if (IS_ERR(k)) -+ return PTR_ERR(k); -+ -+ bkey_init(&k->k); -+ k->k.p = iter->pos; -+ bch2_key_resize(&k->k, len); -+ return bch2_trans_update(trans, iter, k, update_flags); -+} -+ -+int bch2_btree_delete_at(struct btree_trans *trans, -+ struct btree_iter *iter, unsigned update_flags) -+{ -+ return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); -+} -+ -+int bch2_btree_delete_at_buffered(struct btree_trans *trans, -+ enum btree_id btree, struct bpos pos) -+{ -+ struct bkey_i *k; -+ -+ k = bch2_trans_kmalloc(trans, sizeof(*k)); -+ if (IS_ERR(k)) -+ return PTR_ERR(k); -+ -+ bkey_init(&k->k); -+ k->k.p = pos; -+ return bch2_trans_update_buffered(trans, btree, k); -+} -+ -+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, -+ struct bpos start, struct bpos end, -+ unsigned update_flags, -+ u64 *journal_seq) -+{ -+ u32 restart_count = trans->restart_count; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); -+ while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(trans->c, 0); -+ struct bkey_i delete; -+ -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ bkey_init(&delete.k); -+ -+ /* -+ * This could probably be more efficient for extents: -+ */ -+ -+ /* -+ * For extents, iter.pos won't necessarily be the same as -+ * bkey_start_pos(k.k) (for non extents they always will be the -+ * same). It's important that we delete starting from iter.pos -+ * because the range we want to delete could start in the middle -+ * of k. -+ * -+ * (bch2_btree_iter_peek() does guarantee that iter.pos >= -+ * bkey_start_pos(k.k)). 
-+ */ -+ delete.k.p = iter.pos; -+ -+ if (iter.flags & BTREE_ITER_IS_EXTENTS) -+ bch2_key_resize(&delete.k, -+ bpos_min(end, k.k->p).offset - -+ iter.pos.offset); -+ -+ ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: -+ bch2_trans_commit(trans, &disk_res, journal_seq, -+ BTREE_INSERT_NOFAIL); -+ bch2_disk_reservation_put(trans->c, &disk_res); -+err: -+ /* -+ * the bch2_trans_begin() call is in a weird place because we -+ * need to call it after every transaction commit, to avoid path -+ * overflow, but don't want to call it if the delete operation -+ * is a no-op and we have no work to do: -+ */ -+ bch2_trans_begin(trans); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ ret = 0; -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (!ret && trans_was_restarted(trans, restart_count)) -+ ret = -BCH_ERR_transaction_restart_nested; -+ return ret; -+} -+ -+/* -+ * bch_btree_delete_range - delete everything within a given range -+ * -+ * Range is a half open interval - [start, end) -+ */ -+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, -+ struct bpos start, struct bpos end, -+ unsigned update_flags, -+ u64 *journal_seq) -+{ -+ int ret = bch2_trans_run(c, -+ bch2_btree_delete_range_trans(&trans, id, start, end, -+ update_flags, journal_seq)); -+ if (ret == -BCH_ERR_transaction_restart_nested) -+ ret = 0; -+ return ret; -+} -+ -+int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, -+ struct bpos pos, bool set) -+{ -+ struct bkey_i *k; -+ int ret = 0; -+ -+ k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); -+ ret = PTR_ERR_OR_ZERO(k); -+ if (unlikely(ret)) -+ return ret; -+ -+ bkey_init(&k->k); -+ k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; -+ k->k.p = pos; -+ -+ return bch2_trans_update_buffered(trans, btree, k); -+} -+ -+static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) -+{ -+ struct printbuf buf = PRINTBUF; -+ struct jset_entry_log *l; -+ unsigned u64s; -+ int ret; -+ -+ prt_vprintf(&buf, fmt, args); -+ ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; -+ if (ret) -+ goto err; -+ -+ u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); -+ -+ ret = darray_make_room(entries, jset_u64s(u64s)); -+ if (ret) -+ goto err; -+ -+ l = (void *) &darray_top(*entries); -+ l->entry.u64s = cpu_to_le16(u64s); -+ l->entry.btree_id = 0; -+ l->entry.level = 1; -+ l->entry.type = BCH_JSET_ENTRY_log; -+ l->entry.pad[0] = 0; -+ l->entry.pad[1] = 0; -+ l->entry.pad[2] = 0; -+ memcpy(l->d, buf.buf, buf.pos); -+ while (buf.pos & 7) -+ l->d[buf.pos++] = '\0'; -+ -+ entries->nr += jset_u64s(u64s); -+err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static int -+__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, -+ va_list args) -+{ -+ int ret; -+ -+ if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { -+ ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); -+ } else { -+ ret = bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_LAZY_RW|commit_flags, -+ __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); -+ } -+ -+ return ret; -+} -+ -+int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) -+{ -+ va_list args; -+ int ret; -+ -+ va_start(args, fmt); -+ ret = __bch2_fs_log_msg(c, 0, fmt, args); -+ va_end(args); -+ return ret; -+} -+ -+/* -+ * Use for logging messages during recovery to enable reserved space and avoid -+ * blocking. -+ */ -+int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
-+{ -+ va_list args; -+ int ret; -+ -+ va_start(args, fmt); -+ ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); -+ va_end(args); -+ return ret; -+} diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c new file mode 100644 -index 000000000..5f96db539 +index 000000000..6d2d43b6f --- /dev/null +++ b/fs/bcachefs/btree_write_buffer.c -@@ -0,0 +1,372 @@ +@@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -35242,7 +35550,8 @@ index 000000000..5f96db539 + } + return 0; +trans_commit: -+ return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, 0) ?: ++ return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, + commit_flags| + BTREE_INSERT_NOCHECK_RW| @@ -35291,7 +35600,8 @@ index 000000000..5f96db539 + BTREE_ITER_CACHED|BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, 0); ++ bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &iter); + return ret; +} @@ -35360,7 +35670,8 @@ index 000000000..5f96db539 + + if (!iter.path || iter.path->btree_id != i->btree) { + bch2_trans_iter_exit(trans, &iter); -+ bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT); ++ bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, ++ BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); + } + + bch2_btree_iter_set_pos(&iter, i->k.k.p); @@ -35609,10 +35920,10 @@ index 000000000..99993ba77 +#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 100644 -index 000000000..7bb7f0cae +index 000000000..c02c8c917 --- /dev/null +++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2106 @@ +@@ -0,0 +1,2107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. @@ -37539,6 +37850,7 @@ index 000000000..7bb7f0cae +int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) +{ + int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); ++ + if (ret) + bch_err_fn(c, ret); + return ret; @@ -37721,10 +38033,10 @@ index 000000000..7bb7f0cae +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h new file mode 100644 -index 000000000..a418f6648 +index 000000000..f192809f5 --- /dev/null +++ b/fs/bcachefs/buckets.h -@@ -0,0 +1,368 @@ +@@ -0,0 +1,413 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code for manipulating bucket marks for garbage collection. 
@@ -37737,7 +38049,31 @@ index 000000000..a418f6648 + +#include "buckets_types.h" +#include "extents.h" -+#include "super.h" ++#include "sb-members.h" ++ ++static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) ++{ ++ return div_u64(s, ca->mi.bucket_size); ++} ++ ++static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) ++{ ++ return ((sector_t) b) * ca->mi.bucket_size; ++} ++ ++static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) ++{ ++ u32 remainder; ++ ++ div_u64_rem(s, ca->mi.bucket_size, &remainder); ++ return remainder; ++} ++ ++static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, ++ u32 *offset) ++{ ++ return div_u64_rem(s, ca->mi.bucket_size, offset); ++} + +#define for_each_bucket(_b, _buckets) \ + for (_b = (_buckets)->b + (_buckets)->first_bucket; \ @@ -38019,6 +38355,27 @@ index 000000000..a418f6648 + size_t, enum bch_data_type, unsigned); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); + ++static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ u64 b_offset = bucket_to_sector(ca, b); ++ u64 b_end = bucket_to_sector(ca, b + 1); ++ unsigned i; ++ ++ if (!b) ++ return true; ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ u64 end = offset + (1 << layout->sb_max_size_bits); ++ ++ if (!(offset >= b_end || end <= b_offset)) ++ return true; ++ } ++ ++ return false; ++} ++ +/* disk reservations: */ + +static inline void bch2_disk_reservation_put(struct bch_fs *c, @@ -39190,7 +39547,7 @@ index 000000000..fb603df09 +#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h new file mode 100644 -index 000000000..3a4890d39 +index 000000000..0f563ca53 --- /dev/null +++ b/fs/bcachefs/chardev.h @@ -0,0 +1,31 @@ @@ -39213,7 +39570,7 @@ index 000000000..3a4890d39 +static inline long bch2_fs_ioctl(struct bch_fs *c, + unsigned cmd, void __user * arg) +{ -+ return -ENOSYS; ++ return -ENOTTY; +} + +static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} @@ -39227,10 +39584,10 @@ index 000000000..3a4890d39 +#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c new file mode 100644 -index 000000000..a08997a5b +index 000000000..36939020f --- /dev/null +++ b/fs/bcachefs/checksum.c -@@ -0,0 +1,709 @@ +@@ -0,0 +1,753 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" @@ -39498,9 +39855,10 @@ index 000000000..a08997a5b + +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { -+ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; ++ + bch2_checksum_update(&state, p, bv.bv_len); -+ kunmap_atomic(p); ++ kunmap_local(p); + } +#else + __bio_for_each_bvec(bv, bio, *iter, *iter) @@ -39520,10 +39878,10 @@ index 000000000..a08997a5b + +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { -+ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; + + crypto_shash_update(desc, p, bv.bv_len); -+ kunmap_atomic(p); ++ kunmap_local(p); + } +#else + __bio_for_each_bvec(bv, bio, *iter, *iter) @@ -39593,7 +39951,7 @@ index 000000000..a08997a5b + + state.type = type; + bch2_checksum_init(&state); -+ state.seed = a.lo; ++ state.seed = (u64 __force) a.lo; + + BUG_ON(!bch2_checksum_mergeable(type)); + @@ -39604,7 +39962,7 @@ 
index 000000000..a08997a5b + page_address(ZERO_PAGE(0)), b); + b_len -= b; + } -+ a.lo = bch2_checksum_final(&state); ++ a.lo = (__le64 __force) bch2_checksum_final(&state); + a.lo ^= b.lo; + a.hi ^= b.hi; + return a; @@ -39659,9 +40017,10 @@ index 000000000..a08997a5b + merged = bch2_checksum_bio(c, crc_old.csum_type, + extent_nonce(version, crc_old), bio); + -+ if (bch2_crc_cmp(merged, crc_old.csum)) { -+ bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n" ++ if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { ++ bch_err(c, "checksum error in %s() (memory corruption or bug?)\n" + "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", ++ __func__, + crc_old.csum.hi, + crc_old.csum.lo, + merged.hi, @@ -39691,6 +40050,48 @@ index 000000000..a08997a5b + return 0; +} + ++/* BCH_SB_FIELD_crypt: */ ++ ++static int bch2_sb_crypt_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { ++ prt_printf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&crypt->field), sizeof(*crypt)); ++ return -BCH_ERR_invalid_sb_crypt; ++ } ++ ++ if (BCH_CRYPT_KDF_TYPE(crypt)) { ++ prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ return -BCH_ERR_invalid_sb_crypt; ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); ++ prt_newline(out); ++ prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); ++ prt_newline(out); ++ prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); ++ prt_newline(out); ++ prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); ++ prt_newline(out); ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_crypt = { ++ .validate = bch2_sb_crypt_validate, ++ .to_text = bch2_sb_crypt_to_text, ++}; ++ +#ifdef __KERNEL__ +static int __bch2_request_key(char *key_description, struct bch_key *key) +{ @@ -39830,7 +40231,7 @@ index 000000000..a08997a5b + if (ret) + goto out; + -+ crypt->key.magic = BCH_KEY_MAGIC; ++ crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC); + crypt->key.key = key; + + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); @@ -39858,7 +40259,7 @@ index 000000000..a08997a5b + if (ret) + goto err; + -+ key.magic = BCH_KEY_MAGIC; ++ key.magic = cpu_to_le64(BCH_KEY_MAGIC); + get_random_bytes(&key.key, sizeof(key.key)); + + if (keyed) { @@ -39942,10 +40343,10 @@ index 000000000..a08997a5b +} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h new file mode 100644 -index 000000000..1ad1d5f03 +index 000000000..c7b1a8fca --- /dev/null +++ b/fs/bcachefs/checksum.h -@@ -0,0 +1,209 @@ +@@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CHECKSUM_H +#define _BCACHEFS_CHECKSUM_H @@ -40020,6 +40421,8 @@ index 000000000..1ad1d5f03 + : 0; +} + ++extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; ++ +int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, + struct bch_key *); + @@ -40443,10 +40846,10 @@ index 000000000..5fae0012d +#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 -index 000000000..c9ca7cce5 +index 000000000..6b17f7cc5 --- /dev/null +++ b/fs/bcachefs/compress.c -@@ -0,0 +1,713 @@ +@@ -0,0 +1,714 @@ +// 
SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" @@ -41092,7 +41495,8 @@ index 000000000..c9ca7cce5 +static u64 compression_opt_to_feature(unsigned v) +{ + unsigned type = bch2_compression_decode(v).type; -+ return 1ULL << bch2_compression_opt_to_feature[type]; ++ ++ return BIT_ULL(bch2_compression_opt_to_feature[type]); +} + +int bch2_fs_compress_init(struct bch_fs *c) @@ -41359,7 +41763,7 @@ index 000000000..4778aa19b +#endif // _BCACHEFS_COUNTERS_H diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h new file mode 100644 -index 000000000..d4485fa01 +index 000000000..114f86b45 --- /dev/null +++ b/fs/bcachefs/darray.h @@ -0,0 +1,87 @@ @@ -41424,13 +41828,13 @@ index 000000000..d4485fa01 +#define darray_first(_d) ((_d).data[0]) +#define darray_last(_d) ((_d).data[(_d).nr - 1]) + -+#define darray_insert_item(_d, _pos, _item) \ ++#define darray_insert_item(_d, pos, _item) \ +({ \ -+ size_t pos = (_pos); \ ++ size_t _pos = (pos); \ + int _ret = darray_make_room((_d), 1); \ + \ + if (!_ret) \ -+ array_insert_item((_d)->data, (_d)->nr, pos, (_item)); \ ++ array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \ + _ret; \ +}) + @@ -41452,7 +41856,7 @@ index 000000000..d4485fa01 +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c new file mode 100644 -index 000000000..cfc624463 +index 000000000..81518f20d --- /dev/null +++ b/fs/bcachefs/data_update.c @@ -0,0 +1,562 @@ @@ -41873,7 +42277,7 @@ index 000000000..cfc624463 + break; + } + -+ if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { ++ if (closure_nr_remaining(&cl) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } @@ -43070,10 +43474,10 @@ index 000000000..2c37143b5 +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c new file mode 100644 -index 000000000..065ea59ee +index 000000000..a7559ab03 --- /dev/null +++ b/fs/bcachefs/dirent.c -@@ -0,0 +1,565 @@ +@@ -0,0 +1,590 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -43089,12 +43493,25 @@ index 000000000..065ea59ee + +#include + -+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) ++static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) +{ -+ unsigned len = bkey_val_bytes(d.k) - -+ offsetof(struct bch_dirent, d_name); ++ unsigned bkey_u64s = bkey_val_u64s(d.k); ++ unsigned bkey_bytes = bkey_u64s * sizeof(u64); ++ u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1]; ++#if CPU_BIG_ENDIAN ++ unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8; ++#else ++ unsigned trailing_nuls = last_u64 ? 
__builtin_clzll(last_u64) / 8 : 64 / 8; ++#endif + -+ return strnlen(d.v->d_name, len); ++ return bkey_bytes - ++ offsetof(struct bch_dirent, d_name) - ++ trailing_nuls; ++} ++ ++struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) ++{ ++ return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); +} + +static u64 bch2_dirent_hash(const struct bch_hash_info *info, @@ -43117,7 +43534,7 @@ index 000000000..065ea59ee +static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); ++ struct qstr name = bch2_dirent_get_name(d); + + return bch2_dirent_hash(info, &name); +} @@ -43125,20 +43542,20 @@ index 000000000..065ea59ee +static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) +{ + struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); -+ int len = bch2_dirent_name_bytes(l); -+ const struct qstr *r = _r; ++ const struct qstr l_name = bch2_dirent_get_name(l); ++ const struct qstr *r_name = _r; + -+ return len - r->len ?: memcmp(l.v->d_name, r->name, len); ++ return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len); +} + +static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) +{ + struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); + struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); -+ int l_len = bch2_dirent_name_bytes(l); -+ int r_len = bch2_dirent_name_bytes(r); ++ const struct qstr l_name = bch2_dirent_get_name(l); ++ const struct qstr r_name = bch2_dirent_get_name(r); + -+ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); ++ return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len); +} + +static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) @@ -43165,37 +43582,45 @@ index 000000000..065ea59ee + struct printbuf *err) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ unsigned len; ++ struct qstr d_name = bch2_dirent_get_name(d); + -+ len = bch2_dirent_name_bytes(d); -+ if (!len) { ++ if (!d_name.len) { + prt_printf(err, "empty name"); + return -BCH_ERR_invalid_bkey; + } + -+ if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { ++ if (bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len)) { + prt_printf(err, "value too big (%zu > %u)", -+ bkey_val_u64s(k.k), dirent_val_u64s(len)); ++ bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); + return -BCH_ERR_invalid_bkey; + } + -+ if (len > BCH_NAME_MAX) { ++ /* ++ * Check new keys don't exceed the max length ++ * (older keys may be larger.) 
++ */ ++ if ((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX) { + prt_printf(err, "dirent name too big (%u > %u)", -+ len, BCH_NAME_MAX); ++ d_name.len, BCH_NAME_MAX); + return -BCH_ERR_invalid_bkey; + } + -+ if (len == 1 && !memcmp(d.v->d_name, ".", 1)) { ++ if (d_name.len != strnlen(d_name.name, d_name.len)) { ++ prt_printf(err, "dirent has stray data after name's NUL"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ if (d_name.len == 1 && !memcmp(d_name.name, ".", 1)) { + prt_printf(err, "invalid name"); + return -BCH_ERR_invalid_bkey; + } + -+ if (len == 2 && !memcmp(d.v->d_name, "..", 2)) { ++ if (d_name.len == 2 && !memcmp(d_name.name, "..", 2)) { + prt_printf(err, "invalid name"); + return -BCH_ERR_invalid_bkey; + } + -+ if (memchr(d.v->d_name, '/', len)) { ++ if (memchr(d_name.name, '/', d_name.len)) { + prt_printf(err, "invalid name"); + return -BCH_ERR_invalid_bkey; + } @@ -43213,10 +43638,11 @@ index 000000000..065ea59ee + struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ struct qstr d_name = bch2_dirent_get_name(d); + + prt_printf(out, "%.*s -> %llu type %s", -+ bch2_dirent_name_bytes(d), -+ d.v->d_name, ++ d_name.len, ++ d_name.name, + d.v->d_type != DT_SUBVOL + ? le64_to_cpu(d.v->d_inum) + : le32_to_cpu(d.v->d_child_subvol), @@ -43583,6 +44009,7 @@ index 000000000..065ea59ee + subvol_inum target; + u32 snapshot; + struct bkey_buf sk; ++ struct qstr name; + int ret; + + bch2_bkey_buf_init(&sk); @@ -43613,9 +44040,11 @@ index 000000000..065ea59ee + dirent = bkey_i_to_s_c_dirent(sk.k); + bch2_trans_unlock(&trans); + ++ name = bch2_dirent_get_name(dirent); ++ + ctx->pos = dirent.k->p.offset; -+ if (!dir_emit(ctx, dirent.v->d_name, -+ bch2_dirent_name_bytes(dirent), ++ if (!dir_emit(ctx, name.name, ++ name.len, + target.inum, + vfs_d_type(dirent.v->d_type))) + break; @@ -43641,7 +44070,7 @@ index 000000000..065ea59ee +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h new file mode 100644 -index 000000000..b42f4a13b +index 000000000..e9fa1df38 --- /dev/null +++ b/fs/bcachefs/dirent.h @@ -0,0 +1,70 @@ @@ -43671,7 +44100,7 @@ index 000000000..b42f4a13b +struct bch_hash_info; +struct bch_inode_info; + -+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); ++struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); + +static inline unsigned dirent_val_u64s(unsigned len) +{ @@ -43717,13 +44146,14 @@ index 000000000..b42f4a13b +#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c new file mode 100644 -index 000000000..de14ca3a9 +index 000000000..f36472c4a --- /dev/null +++ b/fs/bcachefs/disk_groups.c -@@ -0,0 +1,555 @@ +@@ -0,0 +1,556 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "disk_groups.h" ++#include "sb-members.h" +#include "super-io.h" + +#include @@ -44390,10 +44820,10 @@ index 000000000..bd7711767 +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 -index 000000000..efbb7cf7a +index 000000000..f58e84a2b --- /dev/null +++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1960 @@ +@@ -0,0 +1,1972 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ @@ -44596,11 +45026,14 @@ index 000000000..efbb7cf7a + +static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) +{ -+ unsigned i; ++ if (buf->key.k.type == KEY_TYPE_stripe) { ++ struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key); ++ unsigned i; + -+ for (i = 0; i < buf->key.v.nr_blocks; i++) { -+ kvpfree(buf->data[i], buf->size << 9); -+ buf->data[i] = NULL; ++ 
for (i = 0; i < s->v.nr_blocks; i++) { ++ kvpfree(buf->data[i], buf->size << 9); ++ buf->data[i] = NULL; ++ } + } +} + @@ -44608,7 +45041,7 @@ index 000000000..efbb7cf7a +static int ec_stripe_buf_init(struct ec_stripe_buf *buf, + unsigned offset, unsigned size) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned csum_granularity = 1U << v->csum_granularity_bits; + unsigned end = offset + size; + unsigned i; @@ -44624,7 +45057,7 @@ index 000000000..efbb7cf7a + + memset(buf->valid, 0xFF, sizeof(buf->valid)); + -+ for (i = 0; i < buf->key.v.nr_blocks; i++) { ++ for (i = 0; i < v->nr_blocks; i++) { + buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); + if (!buf->data[i]) + goto err; @@ -44641,7 +45074,7 @@ index 000000000..efbb7cf7a +static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, + unsigned block, unsigned offset) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned end = buf->offset + buf->size; + unsigned len = min(csum_granularity, end - offset); @@ -44660,7 +45093,7 @@ index 000000000..efbb7cf7a + +static void ec_generate_checksums(struct ec_stripe_buf *buf) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned i, j, csums_per_device = stripe_csums_per_device(v); + + if (!v->csum_type) @@ -44677,7 +45110,7 @@ index 000000000..efbb7cf7a + +static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned i; + @@ -44700,7 +45133,7 @@ index 000000000..efbb7cf7a + if (bch2_crc_cmp(want, got)) { + struct printbuf buf2 = PRINTBUF; + -+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i)); ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key)); + + bch_err_ratelimited(c, + "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", @@ -44720,7 +45153,7 @@ index 000000000..efbb7cf7a + +static void ec_generate_ec(struct ec_stripe_buf *buf) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = le16_to_cpu(v->sectors) << 9; + @@ -44729,13 +45162,14 @@ index 000000000..efbb7cf7a + +static unsigned ec_nr_failed(struct ec_stripe_buf *buf) +{ -+ return buf->key.v.nr_blocks - -+ bitmap_weight(buf->valid, buf->key.v.nr_blocks); ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; ++ ++ return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks); +} + +static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = buf->size << 9; @@ -44759,7 +45193,7 @@ index 000000000..efbb7cf7a +static void ec_block_endio(struct bio *bio) +{ + struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); -+ struct bch_stripe *v = &ec_bio->buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v; + struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; + struct bch_dev *ca = ec_bio->ca; + struct closure *cl = bio->bi_private; @@ -44784,11 +45218,11 @@ index 
000000000..efbb7cf7a +static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, + blk_opf_t opf, unsigned idx, struct closure *cl) +{ -+ struct bch_stripe *v = &buf->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + unsigned offset = 0, bytes = buf->size << 9; + struct bch_extent_ptr *ptr = &v->ptrs[idx]; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant ++ enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant + ? BCH_DATA_user + : BCH_DATA_parity; + int rw = op_is_write(opf); @@ -44859,7 +45293,7 @@ index 000000000..efbb7cf7a + ret = -ENOENT; + goto err; + } -+ bkey_reassemble(&stripe->key.k_i, k); ++ bkey_reassemble(&stripe->key, k); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; @@ -44895,7 +45329,7 @@ index 000000000..efbb7cf7a + return -EIO; + } + -+ v = &buf->key.v; ++ v = &bkey_i_to_stripe(&buf->key)->v; + + if (!bch2_ptr_matches_stripe(v, rbio->pick)) { + bch_err_ratelimited(c, @@ -45271,6 +45705,7 @@ index 000000000..efbb7cf7a + struct ec_stripe_buf *s, + struct bpos *bp_pos) +{ ++ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; + struct bch_fs *c = trans->c; + struct bch_backpointer bp; + struct btree_iter iter; @@ -45322,7 +45757,7 @@ index 000000000..efbb7cf7a + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) + goto out; + -+ ptr_c = bkey_matches_stripe(&s->key.v, k, &block); ++ ptr_c = bkey_matches_stripe(v, k, &block); + /* + * It doesn't generally make sense to erasure code cached ptrs: + * XXX: should we be incrementing a counter? @@ -45330,7 +45765,7 @@ index 000000000..efbb7cf7a + if (!ptr_c || ptr_c->cached) + goto out; + -+ dev = s->key.v.ptrs[block].dev; ++ dev = v->ptrs[block].dev; + + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); + ret = PTR_ERR_OR_ZERO(n); @@ -45346,7 +45781,7 @@ index 000000000..efbb7cf7a + stripe_ptr = (struct bch_extent_stripe_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, + .block = block, -+ .redundancy = s->key.v.nr_redundant, ++ .redundancy = v->nr_redundant, + .idx = s->key.k.p.offset, + }; + @@ -45364,7 +45799,8 @@ index 000000000..efbb7cf7a + unsigned block) +{ + struct bch_fs *c = trans->c; -+ struct bch_extent_ptr bucket = s->key.v.ptrs[block]; ++ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; ++ struct bch_extent_ptr bucket = v->ptrs[block]; + struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); + struct bpos bp_pos = POS_MIN; + int ret = 0; @@ -45389,7 +45825,7 @@ index 000000000..efbb7cf7a +static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) +{ + struct btree_trans trans; -+ struct bch_stripe *v = &s->key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + int ret = 0; + @@ -45453,7 +45889,7 @@ index 000000000..efbb7cf7a +{ + struct bch_fs *c = s->c; + struct open_bucket *ob; -+ struct bch_stripe *v = &s->new_stripe.key.v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + int ret; + @@ -45486,7 +45922,7 @@ index 000000000..efbb7cf7a + } + + for (i = 0; i < nr_data; i++) -+ if (stripe_blockcount_get(&s->existing_stripe.key.v, i)) ++ if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i)) + swap(s->new_stripe.data[i], + s->existing_stripe.data[i]); + @@ -45513,8 +45949,9 @@ index 000000000..efbb7cf7a + ret = bch2_trans_do(c, &s->res, NULL, + 
BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL, -+ ec_stripe_key_update(&trans, &s->new_stripe.key, -+ !s->have_existing_stripe)); ++ ec_stripe_key_update(&trans, ++ bkey_i_to_stripe(&s->new_stripe.key), ++ !s->have_existing_stripe)); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err; @@ -45675,14 +46112,14 @@ index 000000000..efbb7cf7a +} + +static void ec_stripe_key_init(struct bch_fs *c, -+ struct bkey_i_stripe *s, ++ struct bkey_i *k, + unsigned nr_data, + unsigned nr_parity, + unsigned stripe_size) +{ ++ struct bkey_i_stripe *s = bkey_stripe_init(k); + unsigned u64s; + -+ bkey_stripe_init(&s->k_i); + s->v.sectors = cpu_to_le16(stripe_size); + s->v.algorithm = 0; + s->v.nr_blocks = nr_data + nr_parity; @@ -45721,8 +46158,8 @@ index 000000000..efbb7cf7a + BCH_BKEY_PTRS_MAX) - h->redundancy; + s->nr_parity = h->redundancy; + -+ ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, -+ s->nr_parity, h->blocksize); ++ ec_stripe_key_init(c, &s->new_stripe.key, ++ s->nr_data, s->nr_parity, h->blocksize); + + h->s = s; + return 0; @@ -45825,15 +46262,16 @@ index 000000000..efbb7cf7a + struct bch_devs_mask devs = h->devs; + struct open_bucket *ob; + struct open_buckets buckets; ++ struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; + unsigned i, j, nr_have_parity = 0, nr_have_data = 0; + bool have_cache = true; + int ret = 0; + -+ BUG_ON(h->s->new_stripe.key.v.nr_blocks != h->s->nr_data + h->s->nr_parity); -+ BUG_ON(h->s->new_stripe.key.v.nr_redundant != h->s->nr_parity); ++ BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); ++ BUG_ON(v->nr_redundant != h->s->nr_parity); + -+ for_each_set_bit(i, h->s->blocks_gotten, h->s->new_stripe.key.v.nr_blocks) { -+ __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); ++ for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { ++ __clear_bit(v->ptrs[i].dev, devs.d); + if (i < h->s->nr_data) + nr_have_data++; + else @@ -45862,7 +46300,7 @@ index 000000000..efbb7cf7a + BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + + h->s->blocks[j] = buckets.v[i]; -+ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); ++ v->ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + @@ -45888,7 +46326,7 @@ index 000000000..efbb7cf7a + BUG_ON(j >= h->s->nr_data); + + h->s->blocks[j] = buckets.v[i]; -+ h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); ++ v->ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + @@ -45938,6 +46376,8 @@ index 000000000..efbb7cf7a +static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) +{ + struct bch_fs *c = trans->c; ++ struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; ++ struct bch_stripe *existing_v; + unsigned i; + s64 idx; + int ret; @@ -45958,9 +46398,11 @@ index 000000000..efbb7cf7a + return ret; + } + -+ BUG_ON(h->s->existing_stripe.key.v.nr_redundant != h->s->nr_parity); -+ h->s->nr_data = h->s->existing_stripe.key.v.nr_blocks - -+ h->s->existing_stripe.key.v.nr_redundant; ++ existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; ++ ++ BUG_ON(existing_v->nr_redundant != h->s->nr_parity); ++ h->s->nr_data = existing_v->nr_blocks - ++ existing_v->nr_redundant; + + ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); + if (ret) { @@ -45969,21 +46411,21 @@ index 000000000..efbb7cf7a + } + + BUG_ON(h->s->existing_stripe.size != h->blocksize); -+ BUG_ON(h->s->existing_stripe.size != le16_to_cpu(h->s->existing_stripe.key.v.sectors)); ++ 
BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); + + /* + * Free buckets we initially allocated - they might conflict with + * blocks from the stripe we're reusing: + */ -+ for_each_set_bit(i, h->s->blocks_gotten, h->s->new_stripe.key.v.nr_blocks) { ++ for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { + bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); + h->s->blocks[i] = 0; + } + memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); + memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); + -+ for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { -+ if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { ++ for (i = 0; i < existing_v->nr_blocks; i++) { ++ if (stripe_blockcount_get(existing_v, i)) { + __set_bit(i, h->s->blocks_gotten); + __set_bit(i, h->s->blocks_allocated); + } @@ -45991,7 +46433,7 @@ index 000000000..efbb7cf7a + ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); + } + -+ bkey_copy(&h->s->new_stripe.key.k_i, &h->s->existing_stripe.key.k_i); ++ bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); + h->s->have_existing_stripe = true; + + return 0; @@ -46160,7 +46602,7 @@ index 000000000..efbb7cf7a + if (!ca) + goto found; + -+ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { ++ for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { + if (!h->s->blocks[i]) + continue; + @@ -46318,7 +46760,7 @@ index 000000000..efbb7cf7a + break; + + if (h->s) { -+ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) ++ for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) + BUG_ON(h->s->blocks[i]); + + kfree(h->s); @@ -46356,10 +46798,10 @@ index 000000000..efbb7cf7a +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 -index 000000000..1b1848e5f +index 000000000..885ae5d51 --- /dev/null +++ b/fs/bcachefs/ec.h -@@ -0,0 +1,263 @@ +@@ -0,0 +1,260 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_H +#define _BCACHEFS_EC_H @@ -46500,10 +46942,7 @@ index 000000000..1b1848e5f + + void *data[BCH_BKEY_PTRS_MAX]; + -+ union { -+ struct bkey_i_stripe key; -+ u64 pad[255]; -+ }; ++ __BKEY_PADDED(key, 255); +}; + +struct ec_stripe_head; @@ -46741,10 +47180,10 @@ index 000000000..dc906fc91 +} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h new file mode 100644 -index 000000000..735eb2416 +index 000000000..f7fa87442 --- /dev/null +++ b/fs/bcachefs/errcode.h -@@ -0,0 +1,246 @@ +@@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERRCODE_H +#define _BCACHEFS_ERRCODE_H @@ -46960,6 +47399,12 @@ index 000000000..735eb2416 + x(BCH_ERR_invalid_sb, invalid_sb_quota) \ + x(BCH_ERR_invalid, invalid_bkey) \ + x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ ++ x(EIO, btree_node_read_err) \ ++ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ ++ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ ++ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ ++ x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \ ++ x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) + +enum bch_errcode { + BCH_ERR_START = 2048, @@ -47702,10 +48147,10 @@ index 000000000..6f5cf4493 +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 -index 000000000..c13e0afc6 +index 000000000..1b25f84e4 --- /dev/null +++ b/fs/bcachefs/extents.c -@@ -0,0 +1,1394 @@ +@@ -0,0 +1,1403 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -48225,13 +48670,13 @@ index 000000000..c13e0afc6 + switch (type) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); -+ memcpy(&dst->crc32.csum, &src.csum.lo, sizeof(dst->crc32.csum)); ++ dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; -+ dst->crc64.csum_lo = src.csum.lo; -+ dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); ++ dst->crc64.csum_lo = (u64 __force) src.csum.lo; ++ dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); + break; + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); @@ -48623,11 +49068,11 @@ index 000000000..c13e0afc6 + + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) -+ if (p1.ptr.dev == p2.ptr.dev && -+ p1.ptr.gen == p2.ptr.gen && -+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == -+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) -+ return true; ++ if (p1.ptr.dev == p2.ptr.dev && ++ p1.ptr.gen == p2.ptr.gen && ++ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == ++ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) ++ return true; + + return false; + } else { @@ -48767,6 +49212,7 @@ index 000000000..c13e0afc6 + +static int extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k, ++ enum bkey_invalid_flags flags, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata, @@ -48779,6 +49225,14 @@ index 000000000..c13e0afc6 + struct bch_dev *ca; + + if (!bch2_dev_exists2(c, ptr->dev)) { ++ /* ++ * If we're in the write path this key might have already been ++ * overwritten, and we could be seeing a device that doesn't ++ * exist anymore due to racing with device removal: ++ */ ++ if (flags & BKEY_INVALID_WRITE) ++ return 0; ++ + prt_printf(err, "pointer to invalid device (%u)", ptr->dev); + return -BCH_ERR_invalid_bkey; + } @@ -48844,8 +49298,8 @@ index 000000000..c13e0afc6 + + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: -+ ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk, -+ false, err); ++ ret = extent_ptr_invalid(c, k, flags, &entry->ptr, ++ size_ondisk, false, err); + if (ret) + return ret; + @@ -49102,7 +49556,7 @@ index 000000000..c13e0afc6 +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h new file mode 100644 -index 000000000..d359b3fda +index 000000000..7ee8d031b --- /dev/null +++ b/fs/bcachefs/extents.h @@ -0,0 +1,757 @@ @@ -49263,7 +49717,7 @@ index 000000000..d359b3fda + common_fields(crc->crc32), + }; + -+ memcpy(&ret.csum.lo, &crc->crc32.csum, sizeof(crc->crc32.csum)); ++ *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum; + return ret; + } + case BCH_EXTENT_ENTRY_crc64: { @@ -49273,8 +49727,8 @@ index 000000000..d359b3fda + .csum.lo = (__force __le64) crc->crc64.csum_lo, + }; + -+ u16 hi = crc->crc64.csum_hi; -+ memcpy(&ret.csum.hi, &hi, sizeof(hi)); ++ *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi; ++ + return ret; + } + case BCH_EXTENT_ENTRY_crc128: { @@ -49796,7 +50250,7 @@ index 000000000..d359b3fda +void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, -+ unsigned, struct printbuf *); ++ enum bkey_invalid_flags, struct printbuf *); + +void bch2_ptr_swab(struct bkey_s); + @@ -50885,221 +51339,28 @@ index 
000000000..dde237859 + struct bch_inode_unpacked *); + +#endif /* _BCACHEFS_FS_COMMON_H */ -diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c new file mode 100644 -index 000000000..6b691b2b5 +index 000000000..dc22182d5 --- /dev/null -+++ b/fs/bcachefs/fs-io.c -@@ -0,0 +1,3982 @@ ++++ b/fs/bcachefs/fs-io-buffered.c +@@ -0,0 +1,1099 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "clock.h" -+#include "error.h" -+#include "extents.h" -+#include "extent_update.h" -+#include "fs.h" +#include "fs-io.h" -+#include "fsck.h" -+#include "inode.h" -+#include "journal.h" ++#include "fs-io-buffered.h" ++#include "fs-io-direct.h" ++#include "fs-io-pagecache.h" +#include "io.h" -+#include "keylist.h" -+#include "quota.h" -+#include "reflink.h" -+#include "trace.h" + -+#include +#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include ++#include +#include + -+#include -+ -+static void bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned); -+ -+struct folio_vec { -+ struct folio *fv_folio; -+ size_t fv_offset; -+ size_t fv_len; -+}; -+ -+static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) -+{ -+ -+ struct folio *folio = page_folio(bv.bv_page); -+ size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + -+ bv.bv_offset; -+ size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); -+ -+ return (struct folio_vec) { -+ .fv_folio = folio, -+ .fv_offset = offset, -+ .fv_len = len, -+ }; -+} -+ -+static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, -+ struct bvec_iter iter) -+{ -+ return biovec_to_foliovec(bio_iter_iovec(bio, iter)); -+} -+ -+#define __bio_for_each_folio(bvl, bio, iter, start) \ -+ for (iter = (start); \ -+ (iter).bi_size && \ -+ ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ -+ bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) -+ -+/** -+ * bio_for_each_folio - iterate over folios within a bio -+ * -+ * Like other non-_all versions, this iterates over what bio->bi_iter currently -+ * points to. This version is for drivers, where the bio may have previously -+ * been split or cloned. -+ */ -+#define bio_for_each_folio(bvl, bio, iter) \ -+ __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) -+ -+/* -+ * Use u64 for the end pos and sector helpers because if the folio covers the -+ * max supported range of the mapping, the start offset of the next folio -+ * overflows loff_t. This breaks much of the range based processing in the -+ * buffered write path. 
-+ */ -+static inline u64 folio_end_pos(struct folio *folio) -+{ -+ return folio_pos(folio) + folio_size(folio); -+} -+ -+static inline size_t folio_sectors(struct folio *folio) -+{ -+ return PAGE_SECTORS << folio_order(folio); -+} -+ -+static inline loff_t folio_sector(struct folio *folio) -+{ -+ return folio_pos(folio) >> 9; -+} -+ -+static inline u64 folio_end_sector(struct folio *folio) -+{ -+ return folio_end_pos(folio) >> 9; -+} -+ -+typedef DARRAY(struct folio *) folios; -+ -+static int filemap_get_contig_folios_d(struct address_space *mapping, -+ loff_t start, u64 end, -+ int fgp_flags, gfp_t gfp, -+ folios *folios) -+{ -+ struct folio *f; -+ u64 pos = start; -+ int ret = 0; -+ -+ while (pos < end) { -+ if ((u64) pos >= (u64) start + (1ULL << 20)) -+ fgp_flags &= ~FGP_CREAT; -+ -+ ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); -+ if (ret) -+ break; -+ -+ f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); -+ if (IS_ERR_OR_NULL(f)) -+ break; -+ -+ BUG_ON(folios->nr && folio_pos(f) != pos); -+ -+ pos = folio_end_pos(f); -+ darray_push(folios, f); -+ } -+ -+ if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) -+ ret = -ENOMEM; -+ -+ return folios->nr ? 0 : ret; -+} -+ -+struct nocow_flush { -+ struct closure *cl; -+ struct bch_dev *ca; -+ struct bio bio; -+}; -+ -+static void nocow_flush_endio(struct bio *_bio) -+{ -+ -+ struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); -+ -+ closure_put(bio->cl); -+ percpu_ref_put(&bio->ca->io_ref); -+ bio_put(&bio->bio); -+} -+ -+static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct closure *cl) -+{ -+ struct nocow_flush *bio; -+ struct bch_dev *ca; -+ struct bch_devs_mask devs; -+ unsigned dev; -+ -+ dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); -+ if (dev == BCH_SB_MEMBERS_MAX) -+ return; -+ -+ devs = inode->ei_devs_need_flush; -+ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); -+ -+ for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { -+ rcu_read_lock(); -+ ca = rcu_dereference(c->devs[dev]); -+ if (ca && !percpu_ref_tryget(&ca->io_ref)) -+ ca = NULL; -+ rcu_read_unlock(); -+ -+ if (!ca) -+ continue; -+ -+ bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, -+ REQ_OP_FLUSH, -+ GFP_KERNEL, -+ &c->nocow_flush_bioset), -+ struct nocow_flush, bio); -+ bio->cl = cl; -+ bio->ca = ca; -+ bio->bio.bi_end_io = nocow_flush_endio; -+ closure_bio_submit(&bio->bio, cl); -+ } -+} -+ -+static int bch2_inode_flush_nocow_writes(struct bch_fs *c, -+ struct bch_inode_info *inode) -+{ -+ struct closure cl; -+ -+ closure_init_stack(&cl); -+ bch2_inode_flush_nocow_writes_async(c, inode, &cl); -+ closure_sync(&cl); -+ -+ return 0; -+} -+ +static inline bool bio_full(struct bio *bio, unsigned len) +{ + if (bio->bi_vcnt >= bio->bi_max_vecs) @@ -51109,893 +51370,6 @@ index 000000000..6b691b2b5 + return false; +} + -+static inline struct address_space *faults_disabled_mapping(void) -+{ -+ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); -+} -+ -+static inline void set_fdm_dropped_locks(void) -+{ -+ current->faults_disabled_mapping = -+ (void *) (((unsigned long) current->faults_disabled_mapping)|1); -+} -+ -+static inline bool fdm_dropped_locks(void) -+{ -+ return ((unsigned long) current->faults_disabled_mapping) & 1; -+} -+ -+struct quota_res { -+ u64 sectors; -+}; -+ -+struct bch_writepage_io { -+ struct bch_inode_info *inode; -+ -+ /* must be last: */ -+ struct bch_write_op op; -+}; -+ 
-+struct dio_write { -+ struct kiocb *req; -+ struct address_space *mapping; -+ struct bch_inode_info *inode; -+ struct mm_struct *mm; -+ unsigned loop:1, -+ extending:1, -+ sync:1, -+ flush:1, -+ free_iov:1; -+ struct quota_res quota_res; -+ u64 written; -+ -+ struct iov_iter iter; -+ struct iovec inline_vecs[2]; -+ -+ /* must be last: */ -+ struct bch_write_op op; -+}; -+ -+struct dio_read { -+ struct closure cl; -+ struct kiocb *req; -+ long ret; -+ bool should_dirty; -+ struct bch_read_bio rbio; -+}; -+ -+/* pagecache_block must be held */ -+static noinline int write_invalidate_inode_pages_range(struct address_space *mapping, -+ loff_t start, loff_t end) -+{ -+ int ret; -+ -+ /* -+ * XXX: the way this is currently implemented, we can spin if a process -+ * is continually redirtying a specific page -+ */ -+ do { -+ if (!mapping->nrpages) -+ return 0; -+ -+ ret = filemap_write_and_wait_range(mapping, start, end); -+ if (ret) -+ break; -+ -+ if (!mapping->nrpages) -+ return 0; -+ -+ ret = invalidate_inode_pages2_range(mapping, -+ start >> PAGE_SHIFT, -+ end >> PAGE_SHIFT); -+ } while (ret == -EBUSY); -+ -+ return ret; -+} -+ -+/* quotas */ -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+static void __bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) -+{ -+ BUG_ON(res->sectors > inode->ei_quota_reserved); -+ -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -+ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); -+ inode->ei_quota_reserved -= res->sectors; -+ res->sectors = 0; -+} -+ -+static void bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) -+{ -+ if (res->sectors) { -+ mutex_lock(&inode->ei_quota_lock); -+ __bch2_quota_reservation_put(c, inode, res); -+ mutex_unlock(&inode->ei_quota_lock); -+ } -+} -+ -+static int bch2_quota_reservation_add(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res, -+ u64 sectors, -+ bool check_enospc) -+{ -+ int ret; -+ -+ if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) -+ return 0; -+ -+ mutex_lock(&inode->ei_quota_lock); -+ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, -+ check_enospc ? 
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); -+ if (likely(!ret)) { -+ inode->ei_quota_reserved += sectors; -+ res->sectors += sectors; -+ } -+ mutex_unlock(&inode->ei_quota_lock); -+ -+ return ret; -+} -+ -+#else -+ -+static void __bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) {} -+ -+static void bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) {} -+ -+static int bch2_quota_reservation_add(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res, -+ unsigned sectors, -+ bool check_enospc) -+{ -+ return 0; -+} -+ -+#endif -+ -+/* i_size updates: */ -+ -+struct inode_new_size { -+ loff_t new_size; -+ u64 now; -+ unsigned fields; -+}; -+ -+static int inode_set_size(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct inode_new_size *s = p; -+ -+ bi->bi_size = s->new_size; -+ if (s->fields & ATTR_ATIME) -+ bi->bi_atime = s->now; -+ if (s->fields & ATTR_MTIME) -+ bi->bi_mtime = s->now; -+ if (s->fields & ATTR_CTIME) -+ bi->bi_ctime = s->now; -+ -+ return 0; -+} -+ -+int __must_check bch2_write_inode_size(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ loff_t new_size, unsigned fields) -+{ -+ struct inode_new_size s = { -+ .new_size = new_size, -+ .now = bch2_current_time(c), -+ .fields = fields, -+ }; -+ -+ return bch2_write_inode(c, inode, inode_set_size, &s, fields); -+} -+ -+static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, -+ struct quota_res *quota_res, s64 sectors) -+{ -+ bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, -+ "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", -+ inode->v.i_ino, (u64) inode->v.i_blocks, sectors, -+ inode->ei_inode.bi_sectors); -+ inode->v.i_blocks += sectors; -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ if (quota_res && -+ !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && -+ sectors > 0) { -+ BUG_ON(sectors > quota_res->sectors); -+ BUG_ON(sectors > inode->ei_quota_reserved); -+ -+ quota_res->sectors -= sectors; -+ inode->ei_quota_reserved -= sectors; -+ } else { -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); -+ } -+#endif -+} -+ -+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, -+ struct quota_res *quota_res, s64 sectors) -+{ -+ if (sectors) { -+ mutex_lock(&inode->ei_quota_lock); -+ __i_sectors_acct(c, inode, quota_res, sectors); -+ mutex_unlock(&inode->ei_quota_lock); -+ } -+} -+ -+/* page state: */ -+ -+/* stored in page->private: */ -+ -+#define BCH_FOLIO_SECTOR_STATE() \ -+ x(unallocated) \ -+ x(reserved) \ -+ x(dirty) \ -+ x(dirty_reserved) \ -+ x(allocated) -+ -+enum bch_folio_sector_state { -+#define x(n) SECTOR_##n, -+ BCH_FOLIO_SECTOR_STATE() -+#undef x -+}; -+ -+static const char * const bch2_folio_sector_states[] = { -+#define x(n) #n, -+ BCH_FOLIO_SECTOR_STATE() -+#undef x -+ NULL -+}; -+ -+static inline enum bch_folio_sector_state -+folio_sector_dirty(enum bch_folio_sector_state state) -+{ -+ switch (state) { -+ case SECTOR_unallocated: -+ return SECTOR_dirty; -+ case SECTOR_reserved: -+ return SECTOR_dirty_reserved; -+ default: -+ return state; -+ } -+} -+ -+static inline enum bch_folio_sector_state -+folio_sector_undirty(enum bch_folio_sector_state state) -+{ -+ switch (state) { -+ case SECTOR_dirty: -+ return SECTOR_unallocated; -+ case SECTOR_dirty_reserved: -+ return SECTOR_reserved; -+ default: -+ return state; -+ } -+} -+ -+static inline enum 
bch_folio_sector_state -+folio_sector_reserve(enum bch_folio_sector_state state) -+{ -+ switch (state) { -+ case SECTOR_unallocated: -+ return SECTOR_reserved; -+ case SECTOR_dirty: -+ return SECTOR_dirty_reserved; -+ default: -+ return state; -+ } -+} -+ -+struct bch_folio_sector { -+ /* Uncompressed, fully allocated replicas (or on disk reservation): */ -+ unsigned nr_replicas:4; -+ -+ /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ -+ unsigned replicas_reserved:4; -+ -+ /* i_sectors: */ -+ enum bch_folio_sector_state state:8; -+}; -+ -+struct bch_folio { -+ spinlock_t lock; -+ atomic_t write_count; -+ /* -+ * Is the sector state up to date with the btree? -+ * (Not the data itself) -+ */ -+ bool uptodate; -+ struct bch_folio_sector s[]; -+}; -+ -+static inline void folio_sector_set(struct folio *folio, -+ struct bch_folio *s, -+ unsigned i, unsigned n) -+{ -+ s->s[i].state = n; -+} -+ -+/* file offset (to folio offset) to bch_folio_sector index */ -+static inline int folio_pos_to_s(struct folio *folio, loff_t pos) -+{ -+ u64 f_offset = pos - folio_pos(folio); -+ BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); -+ return f_offset >> SECTOR_SHIFT; -+} -+ -+static inline struct bch_folio *__bch2_folio(struct folio *folio) -+{ -+ return folio_has_private(folio) -+ ? (struct bch_folio *) folio_get_private(folio) -+ : NULL; -+} -+ -+static inline struct bch_folio *bch2_folio(struct folio *folio) -+{ -+ EBUG_ON(!folio_test_locked(folio)); -+ -+ return __bch2_folio(folio); -+} -+ -+/* for newly allocated folios: */ -+static void __bch2_folio_release(struct folio *folio) -+{ -+ kfree(folio_detach_private(folio)); -+} -+ -+static void bch2_folio_release(struct folio *folio) -+{ -+ EBUG_ON(!folio_test_locked(folio)); -+ __bch2_folio_release(folio); -+} -+ -+/* for newly allocated folios: */ -+static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) -+{ -+ struct bch_folio *s; -+ -+ s = kzalloc(sizeof(*s) + -+ sizeof(struct bch_folio_sector) * -+ folio_sectors(folio), gfp); -+ if (!s) -+ return NULL; -+ -+ spin_lock_init(&s->lock); -+ folio_attach_private(folio, s); -+ return s; -+} -+ -+static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) -+{ -+ return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); -+} -+ -+static unsigned bkey_to_sector_state(struct bkey_s_c k) -+{ -+ if (bkey_extent_is_reservation(k)) -+ return SECTOR_reserved; -+ if (bkey_extent_is_allocation(k.k)) -+ return SECTOR_allocated; -+ return SECTOR_unallocated; -+} -+ -+static void __bch2_folio_set(struct folio *folio, -+ unsigned pg_offset, unsigned pg_len, -+ unsigned nr_ptrs, unsigned state) -+{ -+ struct bch_folio *s = bch2_folio(folio); -+ unsigned i, sectors = folio_sectors(folio); -+ -+ BUG_ON(pg_offset >= sectors); -+ BUG_ON(pg_offset + pg_len > sectors); -+ -+ spin_lock(&s->lock); -+ -+ for (i = pg_offset; i < pg_offset + pg_len; i++) { -+ s->s[i].nr_replicas = nr_ptrs; -+ folio_sector_set(folio, s, i, state); -+ } -+ -+ if (i == sectors) -+ s->uptodate = true; -+ -+ spin_unlock(&s->lock); -+} -+ -+/* -+ * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the -+ * extents btree: -+ */ -+static int bch2_folio_set(struct bch_fs *c, subvol_inum inum, -+ struct folio **folios, unsigned nr_folios) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_folio *s; -+ u64 offset = folio_sector(folios[0]); -+ unsigned folio_idx; -+ u32 snapshot; -+ bool need_set = false; -+ int ret; -+ -+ 
for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { -+ s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); -+ if (!s) -+ return -ENOMEM; -+ -+ need_set |= !s->uptodate; -+ } -+ -+ if (!need_set) -+ return 0; -+ -+ folio_idx = 0; -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, -+ SPOS(inum.inum, offset, snapshot), -+ BTREE_ITER_SLOTS, k, ret) { -+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); -+ unsigned state = bkey_to_sector_state(k); -+ -+ while (folio_idx < nr_folios) { -+ struct folio *folio = folios[folio_idx]; -+ u64 folio_start = folio_sector(folio); -+ u64 folio_end = folio_end_sector(folio); -+ unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start; -+ unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start; -+ -+ BUG_ON(k.k->p.offset < folio_start); -+ BUG_ON(bkey_start_offset(k.k) > folio_end); -+ -+ if (!bch2_folio(folio)->uptodate) -+ __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); -+ -+ if (k.k->p.offset < folio_end) -+ break; -+ folio_idx++; -+ } -+ -+ if (folio_idx == nr_folios) -+ break; -+ } -+ -+ offset = iter.pos.offset; -+ bch2_trans_iter_exit(&trans, &iter); -+err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) -+{ -+ struct bvec_iter iter; -+ struct folio_vec fv; -+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v -+ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); -+ unsigned state = bkey_to_sector_state(k); -+ -+ bio_for_each_folio(fv, bio, iter) -+ __bch2_folio_set(fv.fv_folio, -+ fv.fv_offset >> 9, -+ fv.fv_len >> 9, -+ nr_ptrs, state); -+} -+ -+static void mark_pagecache_unallocated(struct bch_inode_info *inode, -+ u64 start, u64 end) -+{ -+ pgoff_t index = start >> PAGE_SECTORS_SHIFT; -+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; -+ struct folio_batch fbatch; -+ unsigned i, j; -+ -+ if (end <= start) -+ return; -+ -+ folio_batch_init(&fbatch); -+ -+ while (filemap_get_folios(inode->v.i_mapping, -+ &index, end_index, &fbatch)) { -+ for (i = 0; i < folio_batch_count(&fbatch); i++) { -+ struct folio *folio = fbatch.folios[i]; -+ u64 folio_start = folio_sector(folio); -+ u64 folio_end = folio_end_sector(folio); -+ unsigned folio_offset = max(start, folio_start) - folio_start; -+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; -+ struct bch_folio *s; -+ -+ BUG_ON(end <= folio_start); -+ -+ folio_lock(folio); -+ s = bch2_folio(folio); -+ -+ if (s) { -+ spin_lock(&s->lock); -+ for (j = folio_offset; j < folio_offset + folio_len; j++) -+ s->s[j].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ -+ folio_unlock(folio); -+ } -+ folio_batch_release(&fbatch); -+ cond_resched(); -+ } -+} -+ -+static void mark_pagecache_reserved(struct bch_inode_info *inode, -+ u64 start, u64 end) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ pgoff_t index = start >> PAGE_SECTORS_SHIFT; -+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; -+ struct folio_batch fbatch; -+ s64 i_sectors_delta = 0; -+ unsigned i, j; -+ -+ if (end <= start) -+ return; -+ -+ folio_batch_init(&fbatch); -+ -+ while (filemap_get_folios(inode->v.i_mapping, -+ &index, end_index, &fbatch)) { -+ for (i = 0; i < folio_batch_count(&fbatch); i++) { -+ struct 
folio *folio = fbatch.folios[i]; -+ u64 folio_start = folio_sector(folio); -+ u64 folio_end = folio_end_sector(folio); -+ unsigned folio_offset = max(start, folio_start) - folio_start; -+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; -+ struct bch_folio *s; -+ -+ BUG_ON(end <= folio_start); -+ -+ folio_lock(folio); -+ s = bch2_folio(folio); -+ -+ if (s) { -+ spin_lock(&s->lock); -+ for (j = folio_offset; j < folio_offset + folio_len; j++) { -+ i_sectors_delta -= s->s[j].state == SECTOR_dirty; -+ folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); -+ } -+ spin_unlock(&s->lock); -+ } -+ -+ folio_unlock(folio); -+ } -+ folio_batch_release(&fbatch); -+ cond_resched(); -+ } -+ -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); -+} -+ -+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) -+{ -+ /* XXX: this should not be open coded */ -+ return inode->ei_inode.bi_data_replicas -+ ? inode->ei_inode.bi_data_replicas - 1 -+ : c->opts.data_replicas; -+} -+ -+static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, -+ unsigned nr_replicas) -+{ -+ return max(0, (int) nr_replicas - -+ s->nr_replicas - -+ s->replicas_reserved); -+} -+ -+static int bch2_get_folio_disk_reservation(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct folio *folio, bool check_enospc) -+{ -+ struct bch_folio *s = bch2_folio_create(folio, 0); -+ unsigned nr_replicas = inode_nr_replicas(c, inode); -+ struct disk_reservation disk_res = { 0 }; -+ unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; -+ int ret; -+ -+ if (!s) -+ return -ENOMEM; -+ -+ for (i = 0; i < sectors; i++) -+ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); -+ -+ if (!disk_res_sectors) -+ return 0; -+ -+ ret = bch2_disk_reservation_get(c, &disk_res, -+ disk_res_sectors, 1, -+ !check_enospc -+ ? 
BCH_DISK_RESERVATION_NOFAIL -+ : 0); -+ if (unlikely(ret)) -+ return ret; -+ -+ for (i = 0; i < sectors; i++) -+ s->s[i].replicas_reserved += -+ sectors_to_reserve(&s->s[i], nr_replicas); -+ -+ return 0; -+} -+ -+struct bch2_folio_reservation { -+ struct disk_reservation disk; -+ struct quota_res quota; -+}; -+ -+static void bch2_folio_reservation_init(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch2_folio_reservation *res) -+{ -+ memset(res, 0, sizeof(*res)); -+ -+ res->disk.nr_replicas = inode_nr_replicas(c, inode); -+} -+ -+static void bch2_folio_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch2_folio_reservation *res) -+{ -+ bch2_disk_reservation_put(c, &res->disk); -+ bch2_quota_reservation_put(c, inode, &res->quota); -+} -+ -+static int bch2_folio_reservation_get(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct folio *folio, -+ struct bch2_folio_reservation *res, -+ unsigned offset, unsigned len) -+{ -+ struct bch_folio *s = bch2_folio_create(folio, 0); -+ unsigned i, disk_sectors = 0, quota_sectors = 0; -+ int ret; -+ -+ if (!s) -+ return -ENOMEM; -+ -+ BUG_ON(!s->uptodate); -+ -+ for (i = round_down(offset, block_bytes(c)) >> 9; -+ i < round_up(offset + len, block_bytes(c)) >> 9; -+ i++) { -+ disk_sectors += sectors_to_reserve(&s->s[i], -+ res->disk.nr_replicas); -+ quota_sectors += s->s[i].state == SECTOR_unallocated; -+ } -+ -+ if (disk_sectors) { -+ ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); -+ if (unlikely(ret)) -+ return ret; -+ } -+ -+ if (quota_sectors) { -+ ret = bch2_quota_reservation_add(c, inode, &res->quota, -+ quota_sectors, true); -+ if (unlikely(ret)) { -+ struct disk_reservation tmp = { -+ .sectors = disk_sectors -+ }; -+ -+ bch2_disk_reservation_put(c, &tmp); -+ res->disk.sectors -= disk_sectors; -+ return ret; -+ } -+ } -+ -+ return 0; -+} -+ -+static void bch2_clear_folio_bits(struct folio *folio) -+{ -+ struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_folio *s = bch2_folio(folio); -+ struct disk_reservation disk_res = { 0 }; -+ int i, sectors = folio_sectors(folio), dirty_sectors = 0; -+ -+ if (!s) -+ return; -+ -+ EBUG_ON(!folio_test_locked(folio)); -+ EBUG_ON(folio_test_writeback(folio)); -+ -+ for (i = 0; i < sectors; i++) { -+ disk_res.sectors += s->s[i].replicas_reserved; -+ s->s[i].replicas_reserved = 0; -+ -+ dirty_sectors -= s->s[i].state == SECTOR_dirty; -+ folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); -+ } -+ -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ i_sectors_acct(c, inode, NULL, dirty_sectors); -+ -+ bch2_folio_release(folio); -+} -+ -+static void bch2_set_folio_dirty(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct folio *folio, -+ struct bch2_folio_reservation *res, -+ unsigned offset, unsigned len) -+{ -+ struct bch_folio *s = bch2_folio(folio); -+ unsigned i, dirty_sectors = 0; -+ -+ WARN_ON((u64) folio_pos(folio) + offset + len > -+ round_up((u64) i_size_read(&inode->v), block_bytes(c))); -+ -+ BUG_ON(!s->uptodate); -+ -+ spin_lock(&s->lock); -+ -+ for (i = round_down(offset, block_bytes(c)) >> 9; -+ i < round_up(offset + len, block_bytes(c)) >> 9; -+ i++) { -+ unsigned sectors = sectors_to_reserve(&s->s[i], -+ res->disk.nr_replicas); -+ -+ /* -+ * This can happen if we race with the error path in -+ * bch2_writepage_io_done(): -+ */ -+ sectors = min_t(unsigned, sectors, res->disk.sectors); -+ -+ s->s[i].replicas_reserved += sectors; -+ 
res->disk.sectors -= sectors; -+ -+ dirty_sectors += s->s[i].state == SECTOR_unallocated; -+ -+ folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); -+ } -+ -+ spin_unlock(&s->lock); -+ -+ i_sectors_acct(c, inode, &res->quota, dirty_sectors); -+ -+ if (!folio_test_dirty(folio)) -+ filemap_dirty_folio(inode->v.i_mapping, folio); -+} -+ -+vm_fault_t bch2_page_fault(struct vm_fault *vmf) -+{ -+ struct file *file = vmf->vma->vm_file; -+ struct address_space *mapping = file->f_mapping; -+ struct address_space *fdm = faults_disabled_mapping(); -+ struct bch_inode_info *inode = file_bch_inode(file); -+ vm_fault_t ret; -+ -+ if (fdm == mapping) -+ return VM_FAULT_SIGBUS; -+ -+ /* Lock ordering: */ -+ if (fdm > mapping) { -+ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); -+ -+ if (bch2_pagecache_add_tryget(inode)) -+ goto got_lock; -+ -+ bch2_pagecache_block_put(fdm_host); -+ -+ bch2_pagecache_add_get(inode); -+ bch2_pagecache_add_put(inode); -+ -+ bch2_pagecache_block_get(fdm_host); -+ -+ /* Signal that lock has been dropped: */ -+ set_fdm_dropped_locks(); -+ return VM_FAULT_SIGBUS; -+ } -+ -+ bch2_pagecache_add_get(inode); -+got_lock: -+ ret = filemap_fault(vmf); -+ bch2_pagecache_add_put(inode); -+ -+ return ret; -+} -+ -+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) -+{ -+ struct folio *folio = page_folio(vmf->page); -+ struct file *file = vmf->vma->vm_file; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct address_space *mapping = file->f_mapping; -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_folio_reservation res; -+ unsigned len; -+ loff_t isize; -+ vm_fault_t ret; -+ -+ bch2_folio_reservation_init(c, inode, &res); -+ -+ sb_start_pagefault(inode->v.i_sb); -+ file_update_time(file); -+ -+ /* -+ * Not strictly necessary, but helps avoid dio writes livelocking in -+ * write_invalidate_inode_pages_range() - can drop this if/when we get -+ * a write_invalidate_inode_pages_range() that works without dropping -+ * page lock before invalidating page -+ */ -+ bch2_pagecache_add_get(inode); -+ -+ folio_lock(folio); -+ isize = i_size_read(&inode->v); -+ -+ if (folio->mapping != mapping || folio_pos(folio) >= isize) { -+ folio_unlock(folio); -+ ret = VM_FAULT_NOPAGE; -+ goto out; -+ } -+ -+ len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); -+ -+ if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: -+ bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { -+ folio_unlock(folio); -+ ret = VM_FAULT_SIGBUS; -+ goto out; -+ } -+ -+ bch2_set_folio_dirty(c, inode, folio, &res, 0, len); -+ bch2_folio_reservation_put(c, inode, &res); -+ -+ folio_wait_stable(folio); -+ ret = VM_FAULT_LOCKED; -+out: -+ bch2_pagecache_add_put(inode); -+ sb_end_pagefault(inode->v.i_sb); -+ -+ return ret; -+} -+ -+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) -+{ -+ if (offset || length < folio_size(folio)) -+ return; -+ -+ bch2_clear_folio_bits(folio); -+} -+ -+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) -+{ -+ if (folio_test_dirty(folio) || folio_test_writeback(folio)) -+ return false; -+ -+ bch2_clear_folio_bits(folio); -+ return true; -+} -+ +/* readpage(s): */ + +static void bch2_readpages_end_io(struct bio *bio) @@ -52031,7 +51405,7 @@ index 000000000..6b691b2b5 + + iter->mapping = ractl->mapping; + -+ ret = filemap_get_contig_folios_d(iter->mapping, ++ ret = bch2_filemap_get_contig_folios_d(iter->mapping, + ractl->_index << PAGE_SHIFT, + (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, + 0, 
mapping_gfp_mask(iter->mapping), @@ -52304,8 +51678,7 @@ index 000000000..6b691b2b5 + complete(bio->bi_private); +} + -+static int bch2_read_single_folio(struct folio *folio, -+ struct address_space *mapping) ++int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -52345,6 +51718,13 @@ index 000000000..6b691b2b5 + +/* writepages: */ + ++struct bch_writepage_io { ++ struct bch_inode_info *inode; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ +struct bch_writepage_state { + struct bch_writepage_io *io; + struct bch_io_opts opts; @@ -52416,7 +51796,7 @@ index 000000000..6b691b2b5 + * PageWriteback is effectively our ref on the inode - fixup i_blocks + * before calling end_page_writeback: + */ -+ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); ++ bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); + + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s = __bch2_folio(fi.folio); @@ -52510,8 +51890,7 @@ index 000000000..6b691b2b5 + + if (f_sectors > w->tmp_sectors) { + kfree(w->tmp); -+ w->tmp = kzalloc(sizeof(struct bch_folio_sector) * -+ f_sectors, __GFP_NOFAIL); ++ w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); + w->tmp_sectors = f_sectors; + } + @@ -52543,7 +51922,7 @@ index 000000000..6b691b2b5 + ? 0 : nr_replicas_this_write; + + s->s[i].replicas_reserved = 0; -+ folio_sector_set(folio, s, i, SECTOR_allocated); ++ bch2_folio_sector_set(folio, s, i, SECTOR_allocated); + } + spin_unlock(&s->lock); + @@ -52797,7 +52176,7 @@ index 000000000..6b691b2b5 + bch2_folio_reservation_init(c, inode, &res); + darray_init(&folios); + -+ ret = filemap_get_contig_folios_d(mapping, pos, end, ++ ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, + FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, + mapping_gfp_mask(mapping), + &folios); @@ -52877,6 +52256,7 @@ index 000000000..6b691b2b5 + if (!folio_test_uptodate(f) && + f_copied != folio_size(f) && + pos + copied + f_copied < inode->v.i_size) { ++ iov_iter_revert(iter, f_copied); + folio_zero_range(f, 0, folio_size(f)); + folios_trunc(&folios, fi); + break; @@ -53011,8 +52391,123 @@ index 000000000..6b691b2b5 + return written ? 
written : ret; +} + ++ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct file *file = iocb->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ ssize_t ret; ++ ++ if (iocb->ki_flags & IOCB_DIRECT) { ++ ret = bch2_direct_write(iocb, from); ++ goto out; ++ } ++ ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(iocb, from); ++ if (ret <= 0) ++ goto unlock; ++ ++ ret = file_remove_privs(file); ++ if (ret) ++ goto unlock; ++ ++ ret = file_update_time(file); ++ if (ret) ++ goto unlock; ++ ++ ret = bch2_buffered_write(iocb, from); ++ if (likely(ret > 0)) ++ iocb->ki_pos += ret; ++unlock: ++ inode_unlock(&inode->v); ++ ++ if (ret > 0) ++ ret = generic_write_sync(iocb, ret); ++out: ++ return bch2_err_class(ret); ++} ++ ++void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) ++{ ++ bioset_exit(&c->writepage_bioset); ++} ++ ++int bch2_fs_fs_io_buffered_init(struct bch_fs *c) ++{ ++ if (bioset_init(&c->writepage_bioset, ++ 4, offsetof(struct bch_writepage_io, op.wbio.bio), ++ BIOSET_NEED_BVECS)) ++ return -BCH_ERR_ENOMEM_writepage_bioset_init; ++ ++ return 0; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h +new file mode 100644 +index 000000000..a6126ff79 +--- /dev/null ++++ b/fs/bcachefs/fs-io-buffered.h +@@ -0,0 +1,27 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IO_BUFFERED_H ++#define _BCACHEFS_FS_IO_BUFFERED_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++int bch2_read_single_folio(struct folio *, struct address_space *); ++int bch2_read_folio(struct file *, struct folio *); ++ ++int bch2_writepages(struct address_space *, struct writeback_control *); ++void bch2_readahead(struct readahead_control *); ++ ++int bch2_write_begin(struct file *, struct address_space *, loff_t, ++ unsigned, struct page **, void **); ++int bch2_write_end(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page *, void *); ++ ++ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); ++ ++void bch2_fs_fs_io_buffered_exit(struct bch_fs *); ++int bch2_fs_fs_io_buffered_init(struct bch_fs *); ++#else ++static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {} ++static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; } ++#endif ++ ++#endif /* _BCACHEFS_FS_IO_BUFFERED_H */ +diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c +new file mode 100644 +index 000000000..2b29abd24 +--- /dev/null ++++ b/fs/bcachefs/fs-io-direct.c +@@ -0,0 +1,679 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fs-io-direct.h" ++#include "fs-io-pagecache.h" ++#include "io.h" ++ ++#include ++#include ++#include ++ +/* O_DIRECT reads */ + ++struct dio_read { ++ struct closure cl; ++ struct kiocb *req; ++ long ret; ++ bool should_dirty; ++ struct bch_read_bio rbio; ++}; ++ +static void bio_check_or_release(struct bio *bio, bool check_dirty) +{ + if (check_dirty) { @@ -53198,6 +52693,26 @@ index 000000000..6b691b2b5 + +/* O_DIRECT writes */ + ++struct dio_write { ++ struct kiocb *req; ++ struct address_space *mapping; ++ struct bch_inode_info *inode; ++ struct mm_struct *mm; ++ unsigned loop:1, ++ extending:1, ++ sync:1, ++ flush:1, ++ free_iov:1; ++ struct quota_res quota_res; ++ u64 written; ++ ++ struct iov_iter iter; ++ struct iovec inline_vecs[2]; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ +static bool 
bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 size, + unsigned nr_replicas, bool compressed) @@ -53321,7 +52836,8 @@ index 000000000..6b691b2b5 + if (ret) { + dio->op.error = ret; + } else { -+ bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); ++ bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, ++ &dio->op.cl); + bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); + } + } @@ -53387,7 +52903,7 @@ index 000000000..6b691b2b5 + + if (dio->op.i_sectors_delta || dio->quota_res.sectors) { + mutex_lock(&inode->ei_quota_lock); -+ __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); ++ __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); + __bch2_quota_reservation_put(c, inode, &dio->quota_res); + mutex_unlock(&inode->ei_quota_lock); + } @@ -53436,7 +52952,7 @@ index 000000000..6b691b2b5 + goto err; + + if (unlikely(dropped_locks)) { -+ ret = write_invalidate_inode_pages_range(mapping, ++ ret = bch2_write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter_count - 1); + if (unlikely(ret)) @@ -53542,7 +53058,6 @@ index 000000000..6b691b2b5 + bch2_dio_write_continue(dio); +} + -+static noinline +ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; @@ -53606,7 +53121,7 @@ index 000000000..6b691b2b5 + dio->op.c = c; + + if (unlikely(mapping->nrpages)) { -+ ret = write_invalidate_inode_pages_range(mapping, ++ ret = bch2_write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); + if (unlikely(ret)) @@ -53625,44 +53140,1196 @@ index 000000000..6b691b2b5 + goto err; +} + -+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) ++void bch2_fs_fs_io_direct_exit(struct bch_fs *c) +{ -+ struct file *file = iocb->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ ssize_t ret; ++ bioset_exit(&c->dio_write_bioset); ++ bioset_exit(&c->dio_read_bioset); ++} + -+ if (iocb->ki_flags & IOCB_DIRECT) { -+ ret = bch2_direct_write(iocb, from); ++int bch2_fs_fs_io_direct_init(struct bch_fs *c) ++{ ++ if (bioset_init(&c->dio_read_bioset, ++ 4, offsetof(struct dio_read, rbio.bio), ++ BIOSET_NEED_BVECS)) ++ return -BCH_ERR_ENOMEM_dio_read_bioset_init; ++ ++ if (bioset_init(&c->dio_write_bioset, ++ 4, offsetof(struct dio_write, op.wbio.bio), ++ BIOSET_NEED_BVECS)) ++ return -BCH_ERR_ENOMEM_dio_write_bioset_init; ++ ++ return 0; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h +new file mode 100644 +index 000000000..814621ec7 +--- /dev/null ++++ b/fs/bcachefs/fs-io-direct.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IO_DIRECT_H ++#define _BCACHEFS_FS_IO_DIRECT_H ++ ++#ifndef NO_BCACHEFS_FS ++ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *); ++ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); ++ ++void bch2_fs_fs_io_direct_exit(struct bch_fs *); ++int bch2_fs_fs_io_direct_init(struct bch_fs *); ++#else ++static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {} ++static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; } ++#endif ++ ++#endif /* _BCACHEFS_FS_IO_DIRECT_H */ +diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c +new file mode 100644 +index 000000000..1e60eead2 +--- /dev/null ++++ b/fs/bcachefs/fs-io-pagecache.c +@@ -0,0 +1,788 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef 
NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "btree_iter.h" ++#include "extents.h" ++#include "fs-io.h" ++#include "fs-io-pagecache.h" ++#include "subvolume.h" ++ ++#include ++#include ++ ++int bch2_filemap_get_contig_folios_d(struct address_space *mapping, ++ loff_t start, u64 end, ++ int fgp_flags, gfp_t gfp, ++ folios *folios) ++{ ++ struct folio *f; ++ u64 pos = start; ++ int ret = 0; ++ ++ while (pos < end) { ++ if ((u64) pos >= (u64) start + (1ULL << 20)) ++ fgp_flags &= ~FGP_CREAT; ++ ++ ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); ++ if (ret) ++ break; ++ ++ f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); ++ if (IS_ERR_OR_NULL(f)) ++ break; ++ ++ BUG_ON(folios->nr && folio_pos(f) != pos); ++ ++ pos = folio_end_pos(f); ++ darray_push(folios, f); ++ } ++ ++ if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) ++ ret = -ENOMEM; ++ ++ return folios->nr ? 0 : ret; ++} ++ ++/* pagecache_block must be held */ ++int bch2_write_invalidate_inode_pages_range(struct address_space *mapping, ++ loff_t start, loff_t end) ++{ ++ int ret; ++ ++ /* ++ * XXX: the way this is currently implemented, we can spin if a process ++ * is continually redirtying a specific page ++ */ ++ do { ++ if (!mapping->nrpages) ++ return 0; ++ ++ ret = filemap_write_and_wait_range(mapping, start, end); ++ if (ret) ++ break; ++ ++ if (!mapping->nrpages) ++ return 0; ++ ++ ret = invalidate_inode_pages2_range(mapping, ++ start >> PAGE_SHIFT, ++ end >> PAGE_SHIFT); ++ } while (ret == -EBUSY); ++ ++ return ret; ++} ++ ++static const char * const bch2_folio_sector_states[] = { ++#define x(n) #n, ++ BCH_FOLIO_SECTOR_STATE() ++#undef x ++ NULL ++}; ++ ++static inline enum bch_folio_sector_state ++folio_sector_dirty(enum bch_folio_sector_state state) ++{ ++ switch (state) { ++ case SECTOR_unallocated: ++ return SECTOR_dirty; ++ case SECTOR_reserved: ++ return SECTOR_dirty_reserved; ++ default: ++ return state; ++ } ++} ++ ++static inline enum bch_folio_sector_state ++folio_sector_undirty(enum bch_folio_sector_state state) ++{ ++ switch (state) { ++ case SECTOR_dirty: ++ return SECTOR_unallocated; ++ case SECTOR_dirty_reserved: ++ return SECTOR_reserved; ++ default: ++ return state; ++ } ++} ++ ++static inline enum bch_folio_sector_state ++folio_sector_reserve(enum bch_folio_sector_state state) ++{ ++ switch (state) { ++ case SECTOR_unallocated: ++ return SECTOR_reserved; ++ case SECTOR_dirty: ++ return SECTOR_dirty_reserved; ++ default: ++ return state; ++ } ++} ++ ++/* for newly allocated folios: */ ++struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) ++{ ++ struct bch_folio *s; ++ ++ s = kzalloc(sizeof(*s) + ++ sizeof(struct bch_folio_sector) * ++ folio_sectors(folio), gfp); ++ if (!s) ++ return NULL; ++ ++ spin_lock_init(&s->lock); ++ folio_attach_private(folio, s); ++ return s; ++} ++ ++struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) ++{ ++ return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); ++} ++ ++static unsigned bkey_to_sector_state(struct bkey_s_c k) ++{ ++ if (bkey_extent_is_reservation(k)) ++ return SECTOR_reserved; ++ if (bkey_extent_is_allocation(k.k)) ++ return SECTOR_allocated; ++ return SECTOR_unallocated; ++} ++ ++static void __bch2_folio_set(struct folio *folio, ++ unsigned pg_offset, unsigned pg_len, ++ unsigned nr_ptrs, unsigned state) ++{ ++ struct bch_folio *s = bch2_folio(folio); ++ unsigned i, sectors = folio_sectors(folio); ++ ++ BUG_ON(pg_offset >= sectors); ++ BUG_ON(pg_offset + pg_len > sectors); ++ ++ 
spin_lock(&s->lock); ++ ++ for (i = pg_offset; i < pg_offset + pg_len; i++) { ++ s->s[i].nr_replicas = nr_ptrs; ++ bch2_folio_sector_set(folio, s, i, state); ++ } ++ ++ if (i == sectors) ++ s->uptodate = true; ++ ++ spin_unlock(&s->lock); ++} ++ ++/* ++ * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the ++ * extents btree: ++ */ ++int bch2_folio_set(struct bch_fs *c, subvol_inum inum, ++ struct folio **folios, unsigned nr_folios) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_folio *s; ++ u64 offset = folio_sector(folios[0]); ++ unsigned folio_idx; ++ u32 snapshot; ++ bool need_set = false; ++ int ret; ++ ++ for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { ++ s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); ++ if (!s) ++ return -ENOMEM; ++ ++ need_set |= !s->uptodate; ++ } ++ ++ if (!need_set) ++ return 0; ++ ++ folio_idx = 0; ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); ++ if (ret) ++ goto err; ++ ++ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, ++ SPOS(inum.inum, offset, snapshot), ++ BTREE_ITER_SLOTS, k, ret) { ++ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); ++ unsigned state = bkey_to_sector_state(k); ++ ++ while (folio_idx < nr_folios) { ++ struct folio *folio = folios[folio_idx]; ++ u64 folio_start = folio_sector(folio); ++ u64 folio_end = folio_end_sector(folio); ++ unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - ++ folio_start; ++ unsigned folio_len = min(k.k->p.offset, folio_end) - ++ folio_offset - folio_start; ++ ++ BUG_ON(k.k->p.offset < folio_start); ++ BUG_ON(bkey_start_offset(k.k) > folio_end); ++ ++ if (!bch2_folio(folio)->uptodate) ++ __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); ++ ++ if (k.k->p.offset < folio_end) ++ break; ++ folio_idx++; ++ } ++ ++ if (folio_idx == nr_folios) ++ break; ++ } ++ ++ offset = iter.pos.offset; ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) ++{ ++ struct bvec_iter iter; ++ struct folio_vec fv; ++ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v ++ ? 
0 : bch2_bkey_nr_ptrs_fully_allocated(k); ++ unsigned state = bkey_to_sector_state(k); ++ ++ bio_for_each_folio(fv, bio, iter) ++ __bch2_folio_set(fv.fv_folio, ++ fv.fv_offset >> 9, ++ fv.fv_len >> 9, ++ nr_ptrs, state); ++} ++ ++void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct folio_batch fbatch; ++ unsigned i, j; ++ ++ if (end <= start) ++ return; ++ ++ folio_batch_init(&fbatch); ++ ++ while (filemap_get_folios(inode->v.i_mapping, ++ &index, end_index, &fbatch)) { ++ for (i = 0; i < folio_batch_count(&fbatch); i++) { ++ struct folio *folio = fbatch.folios[i]; ++ u64 folio_start = folio_sector(folio); ++ u64 folio_end = folio_end_sector(folio); ++ unsigned folio_offset = max(start, folio_start) - folio_start; ++ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; ++ struct bch_folio *s; ++ ++ BUG_ON(end <= folio_start); ++ ++ folio_lock(folio); ++ s = bch2_folio(folio); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = folio_offset; j < folio_offset + folio_len; j++) ++ s->s[j].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ ++ folio_unlock(folio); ++ } ++ folio_batch_release(&fbatch); ++ cond_resched(); ++ } ++} ++ ++void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, ++ u64 start, u64 end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ pgoff_t index = start >> PAGE_SECTORS_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; ++ struct folio_batch fbatch; ++ s64 i_sectors_delta = 0; ++ unsigned i, j; ++ ++ if (end <= start) ++ return; ++ ++ folio_batch_init(&fbatch); ++ ++ while (filemap_get_folios(inode->v.i_mapping, ++ &index, end_index, &fbatch)) { ++ for (i = 0; i < folio_batch_count(&fbatch); i++) { ++ struct folio *folio = fbatch.folios[i]; ++ u64 folio_start = folio_sector(folio); ++ u64 folio_end = folio_end_sector(folio); ++ unsigned folio_offset = max(start, folio_start) - folio_start; ++ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; ++ struct bch_folio *s; ++ ++ BUG_ON(end <= folio_start); ++ ++ folio_lock(folio); ++ s = bch2_folio(folio); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = folio_offset; j < folio_offset + folio_len; j++) { ++ i_sectors_delta -= s->s[j].state == SECTOR_dirty; ++ bch2_folio_sector_set(folio, s, j, ++ folio_sector_reserve(s->s[j].state)); ++ } ++ spin_unlock(&s->lock); ++ } ++ ++ folio_unlock(folio); ++ } ++ folio_batch_release(&fbatch); ++ cond_resched(); ++ } ++ ++ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); ++} ++ ++static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, ++ unsigned nr_replicas) ++{ ++ return max(0, (int) nr_replicas - ++ s->nr_replicas - ++ s->replicas_reserved); ++} ++ ++int bch2_get_folio_disk_reservation(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct folio *folio, bool check_enospc) ++{ ++ struct bch_folio *s = bch2_folio_create(folio, 0); ++ unsigned nr_replicas = inode_nr_replicas(c, inode); ++ struct disk_reservation disk_res = { 0 }; ++ unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ for (i = 0; i < sectors; i++) ++ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ if (!disk_res_sectors) ++ return 0; ++ ++ ret = bch2_disk_reservation_get(c, &disk_res, ++ disk_res_sectors, 1, ++ !check_enospc ++ ? 
BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ ++ for (i = 0; i < sectors; i++) ++ s->s[i].replicas_reserved += ++ sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ return 0; ++} ++ ++void bch2_folio_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_folio_reservation *res) ++{ ++ bch2_disk_reservation_put(c, &res->disk); ++ bch2_quota_reservation_put(c, inode, &res->quota); ++} ++ ++int bch2_folio_reservation_get(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct folio *folio, ++ struct bch2_folio_reservation *res, ++ unsigned offset, unsigned len) ++{ ++ struct bch_folio *s = bch2_folio_create(folio, 0); ++ unsigned i, disk_sectors = 0, quota_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ BUG_ON(!s->uptodate); ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ disk_sectors += sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ quota_sectors += s->s[i].state == SECTOR_unallocated; ++ } ++ ++ if (disk_sectors) { ++ ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ if (quota_sectors) { ++ ret = bch2_quota_reservation_add(c, inode, &res->quota, ++ quota_sectors, true); ++ if (unlikely(ret)) { ++ struct disk_reservation tmp = { ++ .sectors = disk_sectors ++ }; ++ ++ bch2_disk_reservation_put(c, &tmp); ++ res->disk.sectors -= disk_sectors; ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_clear_folio_bits(struct folio *folio) ++{ ++ struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_folio *s = bch2_folio(folio); ++ struct disk_reservation disk_res = { 0 }; ++ int i, sectors = folio_sectors(folio), dirty_sectors = 0; ++ ++ if (!s) ++ return; ++ ++ EBUG_ON(!folio_test_locked(folio)); ++ EBUG_ON(folio_test_writeback(folio)); ++ ++ for (i = 0; i < sectors; i++) { ++ disk_res.sectors += s->s[i].replicas_reserved; ++ s->s[i].replicas_reserved = 0; ++ ++ dirty_sectors -= s->s[i].state == SECTOR_dirty; ++ bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); ++ } ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ bch2_i_sectors_acct(c, inode, NULL, dirty_sectors); ++ ++ bch2_folio_release(folio); ++} ++ ++void bch2_set_folio_dirty(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct folio *folio, ++ struct bch2_folio_reservation *res, ++ unsigned offset, unsigned len) ++{ ++ struct bch_folio *s = bch2_folio(folio); ++ unsigned i, dirty_sectors = 0; ++ ++ WARN_ON((u64) folio_pos(folio) + offset + len > ++ round_up((u64) i_size_read(&inode->v), block_bytes(c))); ++ ++ BUG_ON(!s->uptodate); ++ ++ spin_lock(&s->lock); ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ unsigned sectors = sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ ++ /* ++ * This can happen if we race with the error path in ++ * bch2_writepage_io_done(): ++ */ ++ sectors = min_t(unsigned, sectors, res->disk.sectors); ++ ++ s->s[i].replicas_reserved += sectors; ++ res->disk.sectors -= sectors; ++ ++ dirty_sectors += s->s[i].state == SECTOR_unallocated; ++ ++ bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); ++ } ++ ++ spin_unlock(&s->lock); ++ ++ bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors); ++ ++ if (!folio_test_dirty(folio)) ++ filemap_dirty_folio(inode->v.i_mapping, folio); ++} 
++ ++vm_fault_t bch2_page_fault(struct vm_fault *vmf) ++{ ++ struct file *file = vmf->vma->vm_file; ++ struct address_space *mapping = file->f_mapping; ++ struct address_space *fdm = faults_disabled_mapping(); ++ struct bch_inode_info *inode = file_bch_inode(file); ++ vm_fault_t ret; ++ ++ if (fdm == mapping) ++ return VM_FAULT_SIGBUS; ++ ++ /* Lock ordering: */ ++ if (fdm > mapping) { ++ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); ++ ++ if (bch2_pagecache_add_tryget(inode)) ++ goto got_lock; ++ ++ bch2_pagecache_block_put(fdm_host); ++ ++ bch2_pagecache_add_get(inode); ++ bch2_pagecache_add_put(inode); ++ ++ bch2_pagecache_block_get(fdm_host); ++ ++ /* Signal that lock has been dropped: */ ++ set_fdm_dropped_locks(); ++ return VM_FAULT_SIGBUS; ++ } ++ ++ bch2_pagecache_add_get(inode); ++got_lock: ++ ret = filemap_fault(vmf); ++ bch2_pagecache_add_put(inode); ++ ++ return ret; ++} ++ ++vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) ++{ ++ struct folio *folio = page_folio(vmf->page); ++ struct file *file = vmf->vma->vm_file; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_folio_reservation res; ++ unsigned len; ++ loff_t isize; ++ vm_fault_t ret; ++ ++ bch2_folio_reservation_init(c, inode, &res); ++ ++ sb_start_pagefault(inode->v.i_sb); ++ file_update_time(file); ++ ++ /* ++ * Not strictly necessary, but helps avoid dio writes livelocking in ++ * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get ++ * a bch2_write_invalidate_inode_pages_range() that works without dropping ++ * page lock before invalidating page ++ */ ++ bch2_pagecache_add_get(inode); ++ ++ folio_lock(folio); ++ isize = i_size_read(&inode->v); ++ ++ if (folio->mapping != mapping || folio_pos(folio) >= isize) { ++ folio_unlock(folio); ++ ret = VM_FAULT_NOPAGE; + goto out; + } + -+ /* We can write back this queue in page reclaim */ -+ current->backing_dev_info = inode_to_bdi(&inode->v); -+ inode_lock(&inode->v); ++ len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); + -+ ret = generic_write_checks(iocb, from); -+ if (ret <= 0) -+ goto unlock; ++ if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: ++ bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { ++ folio_unlock(folio); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } + -+ ret = file_remove_privs(file); -+ if (ret) -+ goto unlock; ++ bch2_set_folio_dirty(c, inode, folio, &res, 0, len); ++ bch2_folio_reservation_put(c, inode, &res); + -+ ret = file_update_time(file); -+ if (ret) -+ goto unlock; -+ -+ ret = bch2_buffered_write(iocb, from); -+ if (likely(ret > 0)) -+ iocb->ki_pos += ret; -+unlock: -+ inode_unlock(&inode->v); -+ current->backing_dev_info = NULL; -+ -+ if (ret > 0) -+ ret = generic_write_sync(iocb, ret); ++ folio_wait_stable(folio); ++ ret = VM_FAULT_LOCKED; +out: -+ return bch2_err_class(ret); ++ bch2_pagecache_add_put(inode); ++ sb_end_pagefault(inode->v.i_sb); ++ ++ return ret; ++} ++ ++void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) ++{ ++ if (offset || length < folio_size(folio)) ++ return; ++ ++ bch2_clear_folio_bits(folio); ++} ++ ++bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) ++{ ++ if (folio_test_dirty(folio) || folio_test_writeback(folio)) ++ return false; ++ ++ bch2_clear_folio_bits(folio); ++ return true; ++} ++ ++/* fseek: */ ++ ++static int folio_data_offset(struct folio *folio, loff_t pos, ++ unsigned min_replicas) ++{ ++ 
struct bch_folio *s = bch2_folio(folio); ++ unsigned i, sectors = folio_sectors(folio); ++ ++ if (s) ++ for (i = folio_pos_to_s(folio, pos); i < sectors; i++) ++ if (s->s[i].state >= SECTOR_dirty && ++ s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) ++ return i << SECTOR_SHIFT; ++ ++ return -1; ++} ++ ++loff_t bch2_seek_pagecache_data(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset, ++ unsigned min_replicas, ++ bool nonblock) ++{ ++ struct folio_batch fbatch; ++ pgoff_t start_index = start_offset >> PAGE_SHIFT; ++ pgoff_t end_index = end_offset >> PAGE_SHIFT; ++ pgoff_t index = start_index; ++ unsigned i; ++ loff_t ret; ++ int offset; ++ ++ folio_batch_init(&fbatch); ++ ++ while (filemap_get_folios(vinode->i_mapping, ++ &index, end_index, &fbatch)) { ++ for (i = 0; i < folio_batch_count(&fbatch); i++) { ++ struct folio *folio = fbatch.folios[i]; ++ ++ if (!nonblock) { ++ folio_lock(folio); ++ } else if (!folio_trylock(folio)) { ++ folio_batch_release(&fbatch); ++ return -EAGAIN; ++ } ++ ++ offset = folio_data_offset(folio, ++ max(folio_pos(folio), start_offset), ++ min_replicas); ++ if (offset >= 0) { ++ ret = clamp(folio_pos(folio) + offset, ++ start_offset, end_offset); ++ folio_unlock(folio); ++ folio_batch_release(&fbatch); ++ return ret; ++ } ++ folio_unlock(folio); ++ } ++ folio_batch_release(&fbatch); ++ cond_resched(); ++ } ++ ++ return end_offset; ++} ++ ++/* ++ * Search for a hole in a folio. ++ * ++ * The filemap layer returns -ENOENT if no folio exists, so reuse the same error ++ * code to indicate a pagecache hole exists at the returned offset. Otherwise ++ * return 0 if the folio is filled with data, or an error code. This function ++ * can return -EAGAIN if nonblock is specified. ++ */ ++static int folio_hole_offset(struct address_space *mapping, loff_t *offset, ++ unsigned min_replicas, bool nonblock) ++{ ++ struct folio *folio; ++ struct bch_folio *s; ++ unsigned i, sectors; ++ int ret = -ENOENT; ++ ++ folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, ++ FGP_LOCK|(nonblock ? 
FGP_NOWAIT : 0), 0); ++ if (IS_ERR(folio)) ++ return PTR_ERR(folio); ++ ++ s = bch2_folio(folio); ++ if (!s) ++ goto unlock; ++ ++ sectors = folio_sectors(folio); ++ for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) ++ if (s->s[i].state < SECTOR_dirty || ++ s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { ++ *offset = max(*offset, ++ folio_pos(folio) + (i << SECTOR_SHIFT)); ++ goto unlock; ++ } ++ ++ *offset = folio_end_pos(folio); ++ ret = 0; ++unlock: ++ folio_unlock(folio); ++ folio_put(folio); ++ return ret; ++} ++ ++loff_t bch2_seek_pagecache_hole(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset, ++ unsigned min_replicas, ++ bool nonblock) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ loff_t offset = start_offset; ++ loff_t ret = 0; ++ ++ while (!ret && offset < end_offset) ++ ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock); ++ ++ if (ret && ret != -ENOENT) ++ return ret; ++ return min(offset, end_offset); ++} ++ ++int bch2_clamp_data_hole(struct inode *inode, ++ u64 *hole_start, ++ u64 *hole_end, ++ unsigned min_replicas, ++ bool nonblock) ++{ ++ loff_t ret; ++ ++ ret = bch2_seek_pagecache_hole(inode, ++ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; ++ if (ret < 0) ++ return ret; ++ ++ *hole_start = ret; ++ ++ if (*hole_start == *hole_end) ++ return 0; ++ ++ ret = bch2_seek_pagecache_data(inode, ++ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; ++ if (ret < 0) ++ return ret; ++ ++ *hole_end = ret; ++ return 0; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h +new file mode 100644 +index 000000000..a2222ad58 +--- /dev/null ++++ b/fs/bcachefs/fs-io-pagecache.h +@@ -0,0 +1,176 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IO_PAGECACHE_H ++#define _BCACHEFS_FS_IO_PAGECACHE_H ++ ++#include ++ ++typedef DARRAY(struct folio *) folios; ++ ++int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, ++ u64, int, gfp_t, folios *); ++int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); ++ ++/* ++ * Use u64 for the end pos and sector helpers because if the folio covers the ++ * max supported range of the mapping, the start offset of the next folio ++ * overflows loff_t. This breaks much of the range based processing in the ++ * buffered write path. ++ */ ++static inline u64 folio_end_pos(struct folio *folio) ++{ ++ return folio_pos(folio) + folio_size(folio); ++} ++ ++static inline size_t folio_sectors(struct folio *folio) ++{ ++ return PAGE_SECTORS << folio_order(folio); ++} ++ ++static inline loff_t folio_sector(struct folio *folio) ++{ ++ return folio_pos(folio) >> 9; ++} ++ ++static inline u64 folio_end_sector(struct folio *folio) ++{ ++ return folio_end_pos(folio) >> 9; ++} ++ ++#define BCH_FOLIO_SECTOR_STATE() \ ++ x(unallocated) \ ++ x(reserved) \ ++ x(dirty) \ ++ x(dirty_reserved) \ ++ x(allocated) ++ ++enum bch_folio_sector_state { ++#define x(n) SECTOR_##n, ++ BCH_FOLIO_SECTOR_STATE() ++#undef x ++}; ++ ++struct bch_folio_sector { ++ /* Uncompressed, fully allocated replicas (or on disk reservation): */ ++ unsigned nr_replicas:4; ++ ++ /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ ++ unsigned replicas_reserved:4; ++ ++ /* i_sectors: */ ++ enum bch_folio_sector_state state:8; ++}; ++ ++struct bch_folio { ++ spinlock_t lock; ++ atomic_t write_count; ++ /* ++ * Is the sector state up to date with the btree? 
++ * (Not the data itself) ++ */ ++ bool uptodate; ++ struct bch_folio_sector s[]; ++}; ++ ++/* Helper for when we need to add debug instrumentation: */ ++static inline void bch2_folio_sector_set(struct folio *folio, ++ struct bch_folio *s, ++ unsigned i, unsigned n) ++{ ++ s->s[i].state = n; ++} ++ ++/* file offset (to folio offset) to bch_folio_sector index */ ++static inline int folio_pos_to_s(struct folio *folio, loff_t pos) ++{ ++ u64 f_offset = pos - folio_pos(folio); ++ ++ BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); ++ return f_offset >> SECTOR_SHIFT; ++} ++ ++/* for newly allocated folios: */ ++static inline void __bch2_folio_release(struct folio *folio) ++{ ++ kfree(folio_detach_private(folio)); ++} ++ ++static inline void bch2_folio_release(struct folio *folio) ++{ ++ EBUG_ON(!folio_test_locked(folio)); ++ __bch2_folio_release(folio); ++} ++ ++static inline struct bch_folio *__bch2_folio(struct folio *folio) ++{ ++ return folio_has_private(folio) ++ ? (struct bch_folio *) folio_get_private(folio) ++ : NULL; ++} ++ ++static inline struct bch_folio *bch2_folio(struct folio *folio) ++{ ++ EBUG_ON(!folio_test_locked(folio)); ++ ++ return __bch2_folio(folio); ++} ++ ++struct bch_folio *__bch2_folio_create(struct folio *, gfp_t); ++struct bch_folio *bch2_folio_create(struct folio *, gfp_t); ++ ++struct bch2_folio_reservation { ++ struct disk_reservation disk; ++ struct quota_res quota; ++}; ++ ++static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) ++{ ++ /* XXX: this should not be open coded */ ++ return inode->ei_inode.bi_data_replicas ++ ? inode->ei_inode.bi_data_replicas - 1 ++ : c->opts.data_replicas; ++} ++ ++static inline void bch2_folio_reservation_init(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_folio_reservation *res) ++{ ++ memset(res, 0, sizeof(*res)); ++ ++ res->disk.nr_replicas = inode_nr_replicas(c, inode); ++} ++ ++int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); ++void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); ++ ++void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); ++void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64); ++ ++int bch2_get_folio_disk_reservation(struct bch_fs *, ++ struct bch_inode_info *, ++ struct folio *, bool); ++ ++void bch2_folio_reservation_put(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch2_folio_reservation *); ++int bch2_folio_reservation_get(struct bch_fs *, ++ struct bch_inode_info *, ++ struct folio *, ++ struct bch2_folio_reservation *, ++ unsigned, unsigned); ++ ++void bch2_set_folio_dirty(struct bch_fs *, ++ struct bch_inode_info *, ++ struct folio *, ++ struct bch2_folio_reservation *, ++ unsigned, unsigned); ++ ++vm_fault_t bch2_page_fault(struct vm_fault *); ++vm_fault_t bch2_page_mkwrite(struct vm_fault *); ++void bch2_invalidate_folio(struct folio *, size_t, size_t); ++bool bch2_release_folio(struct folio *, gfp_t); ++ ++loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool); ++loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool); ++int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); ++ ++#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +new file mode 100644 +index 000000000..ceab12fb8 +--- /dev/null ++++ b/fs/bcachefs/fs-io.c +@@ -0,0 +1,1250 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include 
"alloc_foreground.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "error.h" ++#include "extents.h" ++#include "extent_update.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fs-io-buffered.h" ++#include "fs-io-pagecache.h" ++#include "fsck.h" ++#include "inode.h" ++#include "journal.h" ++#include "io.h" ++#include "keylist.h" ++#include "quota.h" ++#include "reflink.h" ++#include "trace.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++struct nocow_flush { ++ struct closure *cl; ++ struct bch_dev *ca; ++ struct bio bio; ++}; ++ ++static void nocow_flush_endio(struct bio *_bio) ++{ ++ ++ struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); ++ ++ closure_put(bio->cl); ++ percpu_ref_put(&bio->ca->io_ref); ++ bio_put(&bio->bio); ++} ++ ++void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct closure *cl) ++{ ++ struct nocow_flush *bio; ++ struct bch_dev *ca; ++ struct bch_devs_mask devs; ++ unsigned dev; ++ ++ dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); ++ if (dev == BCH_SB_MEMBERS_MAX) ++ return; ++ ++ devs = inode->ei_devs_need_flush; ++ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); ++ ++ for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { ++ rcu_read_lock(); ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca && !percpu_ref_tryget(&ca->io_ref)) ++ ca = NULL; ++ rcu_read_unlock(); ++ ++ if (!ca) ++ continue; ++ ++ bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, ++ REQ_OP_FLUSH, ++ GFP_KERNEL, ++ &c->nocow_flush_bioset), ++ struct nocow_flush, bio); ++ bio->cl = cl; ++ bio->ca = ca; ++ bio->bio.bi_end_io = nocow_flush_endio; ++ closure_bio_submit(&bio->bio, cl); ++ } ++} ++ ++static int bch2_inode_flush_nocow_writes(struct bch_fs *c, ++ struct bch_inode_info *inode) ++{ ++ struct closure cl; ++ ++ closure_init_stack(&cl); ++ bch2_inode_flush_nocow_writes_async(c, inode, &cl); ++ closure_sync(&cl); ++ ++ return 0; ++} ++ ++/* i_size updates: */ ++ ++struct inode_new_size { ++ loff_t new_size; ++ u64 now; ++ unsigned fields; ++}; ++ ++static int inode_set_size(struct btree_trans *trans, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_new_size *s = p; ++ ++ bi->bi_size = s->new_size; ++ if (s->fields & ATTR_ATIME) ++ bi->bi_atime = s->now; ++ if (s->fields & ATTR_MTIME) ++ bi->bi_mtime = s->now; ++ if (s->fields & ATTR_CTIME) ++ bi->bi_ctime = s->now; ++ ++ return 0; ++} ++ ++int __must_check bch2_write_inode_size(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ loff_t new_size, unsigned fields) ++{ ++ struct inode_new_size s = { ++ .new_size = new_size, ++ .now = bch2_current_time(c), ++ .fields = fields, ++ }; ++ ++ return bch2_write_inode(c, inode, inode_set_size, &s, fields); ++} ++ ++void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, ++ struct quota_res *quota_res, s64 sectors) ++{ ++ bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, ++ "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", ++ inode->v.i_ino, (u64) inode->v.i_blocks, sectors, ++ inode->ei_inode.bi_sectors); ++ inode->v.i_blocks += sectors; ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ if (quota_res && ++ !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && ++ sectors > 0) { ++ BUG_ON(sectors > quota_res->sectors); ++ BUG_ON(sectors > inode->ei_quota_reserved); ++ ++ quota_res->sectors 
-= sectors; ++ inode->ei_quota_reserved -= sectors; ++ } else { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); ++ } ++#endif +} + +/* fsync: */ @@ -53763,7 +54430,7 @@ index 000000000..6b691b2b5 + + folio = __filemap_get_folio(mapping, index, + FGP_LOCK|FGP_CREAT, GFP_KERNEL); -+ if (unlikely(IS_ERR_OR_NULL(folio))) { ++ if (IS_ERR_OR_NULL(folio)) { + ret = -ENOMEM; + goto out; + } @@ -53804,10 +54471,10 @@ index 000000000..6b691b2b5 + s->s[i].nr_replicas = 0; + + i_sectors_delta -= s->s[i].state == SECTOR_dirty; -+ folio_sector_set(folio, s, i, SECTOR_unallocated); ++ bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); + } + -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + + /* + * Caller needs to know whether this folio will be written out by @@ -53891,7 +54558,8 @@ index 000000000..6b691b2b5 + return bch2_setattr_nonsize(idmap, inode, iattr); +} + -+static int bch2_truncate_finish_fn(struct bch_inode_info *inode, ++static int bch2_truncate_finish_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ @@ -53899,7 +54567,8 @@ index 000000000..6b691b2b5 + return 0; +} + -+static int bch2_truncate_start_fn(struct bch_inode_info *inode, ++static int bch2_truncate_start_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, void *p) +{ + u64 *new_i_size = p; @@ -53998,7 +54667,7 @@ index 000000000..6b691b2b5 + ret = bch2_fpunch(c, inode_inum(inode), + round_up(iattr->ia_size, block_bytes(c)) >> 9, + U64_MAX, &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + + bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && + !bch2_journal_error(&c->journal), c, @@ -54020,7 +54689,8 @@ index 000000000..6b691b2b5 + +/* fallocate: */ + -+static int inode_update_times_fn(struct bch_inode_info *inode, ++static int inode_update_times_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -54052,7 +54722,7 @@ index 000000000..6b691b2b5 + ret = bch2_fpunch(c, inode_inum(inode), + block_start >> 9, block_end >> 9, + &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + } + + mutex_lock(&inode->ei_update_lock); @@ -54103,7 +54773,7 @@ index 000000000..6b691b2b5 + + new_size = inode->v.i_size + shift; + -+ ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); ++ ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); + if (ret) + return ret; + @@ -54119,7 +54789,7 @@ index 000000000..6b691b2b5 + ret = bch2_fpunch(c, inode_inum(inode), + offset >> 9, (offset + len) >> 9, + &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); + + if (ret) + return ret; @@ -54303,11 +54973,19 @@ index 000000000..6b691b2b5 + } + + if (!(mode & FALLOC_FL_ZERO_RANGE)) { -+ ret = drop_locks_do(&trans, -+ (bch2_clamp_data_hole(&inode->v, -+ &hole_start, -+ &hole_end, -+ opts.data_replicas), 0)); ++ /* ++ * Lock ordering - can't be holding btree locks while ++ * blocking on a folio lock: ++ */ ++ if (bch2_clamp_data_hole(&inode->v, ++ &hole_start, ++ &hole_end, ++ opts.data_replicas, true)) ++ ret = drop_locks_do(&trans, ++ (bch2_clamp_data_hole(&inode->v, ++ &hole_start, ++ 
&hole_end, ++ opts.data_replicas, false), 0)); + bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); + + if (ret) @@ -54332,10 +55010,10 @@ index 000000000..6b691b2b5 + if (ret) + goto bkey_err; + -+ i_sectors_acct(c, inode, "a_res, i_sectors_delta); ++ bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); + + drop_locks_do(&trans, -+ (mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); ++ (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); +bkey_err: + bch2_quota_reservation_put(c, inode, "a_res); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -54348,7 +55026,7 @@ index 000000000..6b691b2b5 + + bch2_fpunch_at(&trans, &iter, inode_inum(inode), + end_sector, &i_sectors_delta); -+ i_sectors_acct(c, inode, "a_res, i_sectors_delta); ++ bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bch2_quota_reservation_put(c, inode, "a_res); + } + @@ -54542,7 +55220,7 @@ index 000000000..6b691b2b5 + + aligned_len = round_up((u64) len, block_bytes(c)); + -+ ret = write_invalidate_inode_pages_range(dst->v.i_mapping, ++ ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, + pos_dst, pos_dst + len - 1); + if (ret) + goto err; @@ -54554,7 +55232,7 @@ index 000000000..6b691b2b5 + + file_update_time(file_dst); + -+ mark_pagecache_unallocated(src, pos_src >> 9, ++ bch2_mark_pagecache_unallocated(src, pos_src >> 9, + (pos_src + aligned_len) >> 9); + + ret = bch2_remap_range(c, @@ -54570,7 +55248,7 @@ index 000000000..6b691b2b5 + */ + ret = min((u64) ret << 9, (u64) len); + -+ i_sectors_acct(c, dst, "a_res, i_sectors_delta); ++ bch2_i_sectors_acct(c, dst, "a_res, i_sectors_delta); + + spin_lock(&dst->v.i_lock); + if (pos_dst + ret > dst->v.i_size) @@ -54589,61 +55267,6 @@ index 000000000..6b691b2b5 + +/* fseek: */ + -+static int folio_data_offset(struct folio *folio, loff_t pos, -+ unsigned min_replicas) -+{ -+ struct bch_folio *s = bch2_folio(folio); -+ unsigned i, sectors = folio_sectors(folio); -+ -+ if (s) -+ for (i = folio_pos_to_s(folio, pos); i < sectors; i++) -+ if (s->s[i].state >= SECTOR_dirty && -+ s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) -+ return i << SECTOR_SHIFT; -+ -+ return -1; -+} -+ -+static loff_t bch2_seek_pagecache_data(struct inode *vinode, -+ loff_t start_offset, -+ loff_t end_offset, -+ unsigned min_replicas) -+{ -+ struct folio_batch fbatch; -+ pgoff_t start_index = start_offset >> PAGE_SHIFT; -+ pgoff_t end_index = end_offset >> PAGE_SHIFT; -+ pgoff_t index = start_index; -+ unsigned i; -+ loff_t ret; -+ int offset; -+ -+ folio_batch_init(&fbatch); -+ -+ while (filemap_get_folios(vinode->i_mapping, -+ &index, end_index, &fbatch)) { -+ for (i = 0; i < folio_batch_count(&fbatch); i++) { -+ struct folio *folio = fbatch.folios[i]; -+ -+ folio_lock(folio); -+ offset = folio_data_offset(folio, -+ max(folio_pos(folio), start_offset), -+ min_replicas); -+ if (offset >= 0) { -+ ret = clamp(folio_pos(folio) + offset, -+ start_offset, end_offset); -+ folio_unlock(folio); -+ folio_batch_release(&fbatch); -+ return ret; -+ } -+ folio_unlock(folio); -+ } -+ folio_batch_release(&fbatch); -+ cond_resched(); -+ } -+ -+ return end_offset; -+} -+ +static loff_t bch2_seek_data(struct file *file, u64 offset) +{ + struct bch_inode_info *inode = file_bch_inode(file); @@ -54689,7 +55312,7 @@ index 000000000..6b691b2b5 + + if (next_data > offset) + next_data = bch2_seek_pagecache_data(&inode->v, -+ offset, next_data, 0); ++ offset, next_data, 0, false); + + if (next_data >= isize) + return -ENXIO; @@ 
-54697,68 +55320,6 @@ index 000000000..6b691b2b5 + return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); +} + -+static bool folio_hole_offset(struct address_space *mapping, loff_t *offset, -+ unsigned min_replicas) -+{ -+ struct folio *folio; -+ struct bch_folio *s; -+ unsigned i, sectors; -+ bool ret = true; -+ -+ folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT); -+ if (IS_ERR_OR_NULL(folio)) -+ return true; -+ -+ s = bch2_folio(folio); -+ if (!s) -+ goto unlock; -+ -+ sectors = folio_sectors(folio); -+ for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) -+ if (s->s[i].state < SECTOR_dirty || -+ s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { -+ *offset = max(*offset, -+ folio_pos(folio) + (i << SECTOR_SHIFT)); -+ goto unlock; -+ } -+ -+ *offset = folio_end_pos(folio); -+ ret = false; -+unlock: -+ folio_unlock(folio); -+ return ret; -+} -+ -+static loff_t bch2_seek_pagecache_hole(struct inode *vinode, -+ loff_t start_offset, -+ loff_t end_offset, -+ unsigned min_replicas) -+{ -+ struct address_space *mapping = vinode->i_mapping; -+ loff_t offset = start_offset; -+ -+ while (offset < end_offset && -+ !folio_hole_offset(mapping, &offset, min_replicas)) -+ ; -+ -+ return min(offset, end_offset); -+} -+ -+static void bch2_clamp_data_hole(struct inode *inode, -+ u64 *hole_start, -+ u64 *hole_end, -+ unsigned min_replicas) -+{ -+ *hole_start = bch2_seek_pagecache_hole(inode, -+ *hole_start << 9, *hole_end << 9, min_replicas) >> 9; -+ -+ if (*hole_start == *hole_end) -+ return; -+ -+ *hole_end = bch2_seek_pagecache_data(inode, -+ *hole_start << 9, *hole_end << 9, min_replicas) >> 9; -+} -+ +static loff_t bch2_seek_hole(struct file *file, u64 offset) +{ + struct bch_inode_info *inode = file_bch_inode(file); @@ -54788,12 +55349,12 @@ index 000000000..6b691b2b5 + BTREE_ITER_SLOTS, k, ret) { + if (k.k->p.inode != inode->v.i_ino) { + next_hole = bch2_seek_pagecache_hole(&inode->v, -+ offset, MAX_LFS_FILESIZE, 0); ++ offset, MAX_LFS_FILESIZE, 0, false); + break; + } else if (!bkey_extent_is_data(k.k)) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + max(offset, bkey_start_offset(k.k) << 9), -+ k.k->p.offset << 9, 0); ++ k.k->p.offset << 9, 0, false); + + if (next_hole < k.k->p.offset << 9) + break; @@ -54843,28 +55404,10 @@ index 000000000..6b691b2b5 +void bch2_fs_fsio_exit(struct bch_fs *c) +{ + bioset_exit(&c->nocow_flush_bioset); -+ bioset_exit(&c->dio_write_bioset); -+ bioset_exit(&c->dio_read_bioset); -+ bioset_exit(&c->writepage_bioset); +} + +int bch2_fs_fsio_init(struct bch_fs *c) +{ -+ if (bioset_init(&c->writepage_bioset, -+ 4, offsetof(struct bch_writepage_io, op.wbio.bio), -+ BIOSET_NEED_BVECS)) -+ return -BCH_ERR_ENOMEM_writepage_bioset_init; -+ -+ if (bioset_init(&c->dio_read_bioset, -+ 4, offsetof(struct dio_read, rbio.bio), -+ BIOSET_NEED_BVECS)) -+ return -BCH_ERR_ENOMEM_dio_read_bioset_init; -+ -+ if (bioset_init(&c->dio_write_bioset, -+ 4, offsetof(struct dio_write, op.wbio.bio), -+ BIOSET_NEED_BVECS)) -+ return -BCH_ERR_ENOMEM_dio_write_bioset_init; -+ + if (bioset_init(&c->nocow_flush_bioset, + 1, offsetof(struct nocow_flush, bio), 0)) + return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; @@ -54875,10 +55418,10 @@ index 000000000..6b691b2b5 +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h new file mode 100644 -index 000000000..af9053315 +index 000000000..bb5b709fa --- /dev/null +++ b/fs/bcachefs/fs-io.h -@@ -0,0 +1,54 @@ +@@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_H 
+#define _BCACHEFS_FS_IO_H @@ -54886,29 +55429,164 @@ index 000000000..af9053315 +#ifndef NO_BCACHEFS_FS + +#include "buckets.h" ++#include "fs.h" +#include "io_types.h" ++#include "quota.h" + +#include + -+struct quota_res; ++struct folio_vec { ++ struct folio *fv_folio; ++ size_t fv_offset; ++ size_t fv_len; ++}; ++ ++static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) ++{ ++ ++ struct folio *folio = page_folio(bv.bv_page); ++ size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + ++ bv.bv_offset; ++ size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); ++ ++ return (struct folio_vec) { ++ .fv_folio = folio, ++ .fv_offset = offset, ++ .fv_len = len, ++ }; ++} ++ ++static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, ++ struct bvec_iter iter) ++{ ++ return biovec_to_foliovec(bio_iter_iovec(bio, iter)); ++} ++ ++#define __bio_for_each_folio(bvl, bio, iter, start) \ ++ for (iter = (start); \ ++ (iter).bi_size && \ ++ ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ ++ bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) ++ ++/** ++ * bio_for_each_folio - iterate over folios within a bio ++ * ++ * Like other non-_all versions, this iterates over what bio->bi_iter currently ++ * points to. This version is for drivers, where the bio may have previously ++ * been split or cloned. ++ */ ++#define bio_for_each_folio(bvl, bio, iter) \ ++ __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) ++ ++struct quota_res { ++ u64 sectors; ++}; ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++static inline void __bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++ BUG_ON(res->sectors > inode->ei_quota_reserved); ++ ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, ++ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); ++ inode->ei_quota_reserved -= res->sectors; ++ res->sectors = 0; ++} ++ ++static inline void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++ if (res->sectors) { ++ mutex_lock(&inode->ei_quota_lock); ++ __bch2_quota_reservation_put(c, inode, res); ++ mutex_unlock(&inode->ei_quota_lock); ++ } ++} ++ ++static inline int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ u64 sectors, ++ bool check_enospc) ++{ ++ int ret; ++ ++ if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) ++ return 0; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, ++ check_enospc ? 
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); ++ if (likely(!ret)) { ++ inode->ei_quota_reserved += sectors; ++ res->sectors += sectors; ++ } ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++#else ++ ++static inline void __bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) {} ++ ++static inline void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) {} ++ ++static inline int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ return 0; ++} ++ ++#endif ++ ++void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *, ++ struct quota_res *, s64); ++ ++static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, ++ struct quota_res *quota_res, s64 sectors) ++{ ++ if (sectors) { ++ mutex_lock(&inode->ei_quota_lock); ++ __bch2_i_sectors_acct(c, inode, quota_res, sectors); ++ mutex_unlock(&inode->ei_quota_lock); ++ } ++} ++ ++static inline struct address_space *faults_disabled_mapping(void) ++{ ++ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); ++} ++ ++static inline void set_fdm_dropped_locks(void) ++{ ++ current->faults_disabled_mapping = ++ (void *) (((unsigned long) current->faults_disabled_mapping)|1); ++} ++ ++static inline bool fdm_dropped_locks(void) ++{ ++ return ((unsigned long) current->faults_disabled_mapping) & 1; ++} ++ ++void bch2_inode_flush_nocow_writes_async(struct bch_fs *, ++ struct bch_inode_info *, struct closure *); + +int __must_check bch2_write_inode_size(struct bch_fs *, + struct bch_inode_info *, + loff_t, unsigned); + -+int bch2_read_folio(struct file *, struct folio *); -+ -+int bch2_writepages(struct address_space *, struct writeback_control *); -+void bch2_readahead(struct readahead_control *); -+ -+int bch2_write_begin(struct file *, struct address_space *, loff_t, -+ unsigned, struct page **, void **); -+int bch2_write_end(struct file *, struct address_space *, loff_t, -+ unsigned, unsigned, struct page *, void *); -+ -+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); -+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); -+ +int bch2_fsync(struct file *, loff_t, loff_t, int); + +int bch2_truncate(struct mnt_idmap *, @@ -54920,11 +55598,6 @@ index 000000000..af9053315 + +loff_t bch2_llseek(struct file *, loff_t, int); + -+vm_fault_t bch2_page_fault(struct vm_fault *); -+vm_fault_t bch2_page_mkwrite(struct vm_fault *); -+void bch2_invalidate_folio(struct folio *, size_t, size_t); -+bool bch2_release_folio(struct folio *, gfp_t); -+ +void bch2_fs_fsio_exit(struct bch_fs *); +int bch2_fs_fsio_init(struct bch_fs *); +#else @@ -54935,10 +55608,10 @@ index 000000000..af9053315 +#endif /* _BCACHEFS_FS_IO_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c new file mode 100644 -index 000000000..dfa1bf73c +index 000000000..141bcced0 --- /dev/null +++ b/fs/bcachefs/fs-ioctl.c -@@ -0,0 +1,556 @@ +@@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -54972,7 +55645,8 @@ index 000000000..dfa1bf73c + bool projinherit; +}; + -+static int bch2_inode_flags_set(struct bch_inode_info *inode, ++static int bch2_inode_flags_set(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ @@ -55065,7 +55739,8 @@ index 000000000..dfa1bf73c + return copy_to_user(arg, &fa, sizeof(fa)); +} + 
-+static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, ++static int fssetxattr_inode_update_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ @@ -55076,7 +55751,7 @@ index 000000000..dfa1bf73c + bi->bi_project = s->projid; + } + -+ return bch2_inode_flags_set(inode, bi, p); ++ return bch2_inode_flags_set(trans, inode, bi, p); +} + +static int bch2_ioc_fssetxattr(struct bch_fs *c, @@ -55133,7 +55808,8 @@ index 000000000..dfa1bf73c + return ret; +} + -+static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, ++static int bch2_reinherit_attrs_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ @@ -55584,10 +56260,10 @@ index 000000000..f201980ef +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 -index 000000000..8d2f388b4 +index 000000000..80dcda43e --- /dev/null +++ b/fs/bcachefs/fs.c -@@ -0,0 +1,1943 @@ +@@ -0,0 +1,1961 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -55604,12 +56280,16 @@ index 000000000..8d2f388b4 +#include "fs-common.h" +#include "fs-io.h" +#include "fs-ioctl.h" ++#include "fs-io-buffered.h" ++#include "fs-io-direct.h" ++#include "fs-io-pagecache.h" +#include "fsck.h" +#include "inode.h" +#include "io.h" +#include "journal.h" +#include "keylist.h" +#include "quota.h" ++#include "snapshot.h" +#include "super.h" +#include "xattr.h" + @@ -55679,7 +56359,7 @@ index 000000000..8d2f388b4 + + ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT) ?: -+ (set ? set(inode, &inode_u, p) : 0) ?: ++ (set ? set(&trans, inode, &inode_u, p) : 0) ?: + bch2_inode_write(&trans, &iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); + @@ -55793,7 +56473,7 @@ index 000000000..8d2f388b4 + + if (ret) { + iget_failed(&inode->v); -+ return ERR_PTR(ret); ++ return ERR_PTR(bch2_err_class(ret)); + } + + mutex_lock(&c->vfs_inodes_lock); @@ -56590,11 +57270,16 @@ index 000000000..8d2f388b4 +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ int ret; + + if (!dir_emit_dots(file, ctx)) + return 0; + -+ return bch2_readdir(c, inode_inum(inode), ctx); ++ ret = bch2_readdir(c, inode_inum(inode), ctx); ++ if (ret) ++ bch_err_fn(c, ret); ++ ++ return bch2_err_class(ret); +} + +static const struct file_operations bch_file_operations = { @@ -56819,7 +57504,8 @@ index 000000000..8d2f388b4 + struct bch_inode_unpacked inode_u; + subvol_inum target; + u32 snapshot; -+ unsigned name_len; ++ struct qstr dirent_name; ++ unsigned name_len = 0; + int ret; + + if (!S_ISDIR(dir->v.i_mode)) @@ -56896,9 +57582,10 @@ index 000000000..8d2f388b4 + ret = -ENOENT; + goto err; +found: -+ name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX); ++ dirent_name = bch2_dirent_get_name(d); + -+ memcpy(name, d.v->d_name, name_len); ++ name_len = min_t(unsigned, dirent_name.len, NAME_MAX); ++ memcpy(name, dirent_name.name, name_len); + name[name_len] = '\0'; +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -56996,7 +57683,8 @@ index 000000000..8d2f388b4 + call_rcu(&vinode->i_rcu, bch2_i_callback); +} + -+static int inode_update_times_fn(struct bch_inode_info *inode, ++static int inode_update_times_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ @@ -57484,7 +58172,10 @@ index 000000000..8d2f388b4 + return dget(sb->s_root); 
+ +err_put_super: ++ sb->s_fs_info = NULL; ++ c->vfs_sb = NULL; + deactivate_locked_super(sb); ++ bch2_fs_stop(c); + return ERR_PTR(bch2_err_class(ret)); +} + @@ -57492,8 +58183,11 @@ index 000000000..8d2f388b4 +{ + struct bch_fs *c = sb->s_fs_info; + ++ if (c) ++ c->vfs_sb = NULL; + generic_shutdown_super(sb); -+ bch2_fs_free(c); ++ if (c) ++ bch2_fs_free(c); +} + +static struct file_system_type bcache_fs_type = { @@ -57533,10 +58227,10 @@ index 000000000..8d2f388b4 +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h new file mode 100644 -index 000000000..6170d214d +index 000000000..10e11119d --- /dev/null +++ b/fs/bcachefs/fs.h -@@ -0,0 +1,208 @@ +@@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_H +#define _BCACHEFS_FS_H @@ -57713,7 +58407,8 @@ index 000000000..6170d214d +struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); + +/* returns 0 if we want to do the update, or error is passed up */ -+typedef int (*inode_set_fn)(struct bch_inode_info *, ++typedef int (*inode_set_fn)(struct btree_trans *, ++ struct bch_inode_info *, + struct bch_inode_unpacked *, void *); + +void bch2_inode_update_after_write(struct btree_trans *, @@ -57747,10 +58442,10 @@ index 000000000..6170d214d +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 -index 000000000..d3eb3dc1c +index 000000000..238caeeaf --- /dev/null +++ b/fs/bcachefs/fsck.c -@@ -0,0 +1,2471 @@ +@@ -0,0 +1,2483 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -57764,7 +58459,8 @@ index 000000000..d3eb3dc1c +#include "fsck.h" +#include "inode.h" +#include "keylist.h" -+#include "subvolume.h" ++#include "recovery.h" ++#include "snapshot.h" +#include "super.h" +#include "xattr.h" + @@ -57972,69 +58668,6 @@ index 000000000..d3eb3dc1c + return ret; +} + -+static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter = { NULL }; -+ struct bkey_i_inode_generation delete; -+ struct bch_inode_unpacked inode_u; -+ struct bkey_s_c k; -+ int ret; -+ -+ do { -+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, -+ SPOS(inum, 0, snapshot), -+ SPOS(inum, U64_MAX, snapshot), -+ 0, NULL) ?: -+ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, -+ SPOS(inum, 0, snapshot), -+ SPOS(inum, U64_MAX, snapshot), -+ 0, NULL) ?: -+ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, -+ SPOS(inum, 0, snapshot), -+ SPOS(inum, U64_MAX, snapshot), -+ 0, NULL); -+ } while (ret == -BCH_ERR_transaction_restart_nested); -+ if (ret) -+ goto err; -+retry: -+ bch2_trans_begin(trans); -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, -+ SPOS(0, inum, snapshot), BTREE_ITER_INTENT); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (!bkey_is_inode(k.k)) { -+ bch2_fs_inconsistent(c, -+ "inode %llu:%u not found when deleting", -+ inum, snapshot); -+ ret = -EIO; -+ goto err; -+ } -+ -+ bch2_inode_unpack(k, &inode_u); -+ -+ /* Subvolume root? 
*/ -+ if (inode_u.bi_subvol) -+ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); -+ -+ bkey_inode_generation_init(&delete.k_i); -+ delete.k.p = iter.pos; -+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); -+ -+ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ return ret ?: -BCH_ERR_transaction_restart_nested; -+} -+ +static int __remove_dirent(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; @@ -58224,6 +58857,28 @@ index 000000000..d3eb3dc1c + memset(s, 0, sizeof(*s)); +} + ++static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) ++{ ++ struct snapshots_seen_entry *i, n = { ++ .id = id, ++ .equiv = bch2_snapshot_equiv(c, id), ++ }; ++ int ret = 0; ++ ++ darray_for_each(s->ids, i) { ++ if (i->id == id) ++ return 0; ++ if (i->id > id) ++ break; ++ } ++ ++ ret = darray_insert_item(&s->ids, i - s->ids.data, n); ++ if (ret) ++ bch_err(c, "error reallocating snapshots_seen table (size %zu)", ++ s->ids.size); ++ return ret; ++} ++ +static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, + enum btree_id btree_id, struct bpos pos) +{ @@ -58646,14 +59301,6 @@ index 000000000..d3eb3dc1c + if (ret) + goto err; + -+ /* -+ * if snapshot id isn't a leaf node, skip it - deletion in -+ * particular is not atomic, so on the internal snapshot nodes -+ * we can see inodes marked for deletion after a clean shutdown -+ */ -+ if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot)) -+ return 0; -+ + if (!bkey_is_inode(k.k)) + return 0; + @@ -58675,6 +59322,27 @@ index 000000000..d3eb3dc1c + return -EINVAL; + } + ++ if ((u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED)) && ++ bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { ++ struct bpos new_min_pos; ++ ++ ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); ++ if (ret) ++ goto err; ++ ++ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED; ++ ++ ret = __write_inode(trans, &u, iter->pos.snapshot); ++ if (ret) { ++ bch_err_msg(c, ret, "in fsck: error updating inode"); ++ return ret; ++ } ++ ++ if (!bpos_eq(new_min_pos, POS_MIN)) ++ bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); ++ return 0; ++ } ++ + if (u.bi_flags & BCH_INODE_UNLINKED && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu unlinked", @@ -58682,7 +59350,7 @@ index 000000000..d3eb3dc1c + bch2_trans_unlock(trans); + bch2_fs_lazy_rw(c); + -+ ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); ++ ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error in fsck: error while deleting inode: %s", + bch2_err_str(ret)); @@ -58753,9 +59421,10 @@ index 000000000..d3eb3dc1c + + if (do_update) { + ret = __write_inode(trans, &u, iter->pos.snapshot); -+ if (ret) -+ bch_err(c, "error in fsck: error updating inode: %s", -+ bch2_err_str(ret)); ++ if (ret) { ++ bch_err_msg(c, ret, "in fsck: error updating inode"); ++ return ret; ++ } + } +err: +fsck_err: @@ -58938,74 +59607,116 @@ index 000000000..d3eb3dc1c + +static int overlapping_extents_found(struct btree_trans *trans, + enum btree_id btree, -+ struct bpos pos1, struct bkey pos2, -+ bool *fixed) ++ struct bpos 
pos1, struct snapshots_seen *pos1_seen, ++ struct bkey pos2, ++ bool *fixed, ++ struct extent_end *extent_end) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ u32 snapshot = min(pos1.snapshot, pos2.p.snapshot); ++ struct btree_iter iter1, iter2 = { NULL }; ++ struct bkey_s_c k1, k2; + int ret; + + BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); + -+ bch2_trans_iter_init(trans, &iter, btree, SPOS(pos1.inode, pos1.offset - 1, snapshot), 0); -+ k = bch2_btree_iter_peek_upto(&iter, POS(pos1.inode, U64_MAX)); -+ ret = bkey_err(k); ++ bch2_trans_iter_init(trans, &iter1, btree, pos1, ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_NOT_EXTENTS); ++ k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); ++ ret = bkey_err(k1); + if (ret) + goto err; + + prt_str(&buf, "\n "); -+ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_bkey_val_to_text(&buf, c, k1); + -+ if (!bpos_eq(pos1, k.k->p)) { -+ bch_err(c, "%s: error finding first overlapping extent when repairing%s", ++ if (!bpos_eq(pos1, k1.k->p)) { ++ prt_str(&buf, "\n wanted\n "); ++ bch2_bpos_to_text(&buf, pos1); ++ prt_str(&buf, "\n "); ++ bch2_bkey_to_text(&buf, &pos2); ++ ++ bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", + __func__, buf.buf); + ret = -BCH_ERR_internal_fsck_err; + goto err; + } + -+ while (1) { -+ bch2_btree_iter_advance(&iter); ++ bch2_trans_copy_iter(&iter2, &iter1); + -+ k = bch2_btree_iter_peek_upto(&iter, POS(pos1.inode, U64_MAX)); -+ ret = bkey_err(k); ++ while (1) { ++ bch2_btree_iter_advance(&iter2); ++ ++ k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); ++ ret = bkey_err(k2); + if (ret) + goto err; + -+ if (bkey_ge(k.k->p, pos2.p)) ++ if (bpos_ge(k2.k->p, pos2.p)) + break; -+ + } + + prt_str(&buf, "\n "); -+ bch2_bkey_val_to_text(&buf, c, k); ++ bch2_bkey_val_to_text(&buf, c, k2); + -+ if (bkey_gt(k.k->p, pos2.p) || -+ pos2.size != k.k->size) { ++ if (bpos_gt(k2.k->p, pos2.p) || ++ pos2.size != k2.k->size) { + bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", + __func__, buf.buf); + ret = -BCH_ERR_internal_fsck_err; + goto err; + } + -+ if (fsck_err(c, "overlapping extents%s", buf.buf)) { -+ struct bpos update_pos = pos1.snapshot < pos2.p.snapshot ? pos1 : pos2.p; -+ struct btree_iter update_iter; ++ prt_printf(&buf, "\n overwriting %s extent", ++ pos1.snapshot >= pos2.p.snapshot ? 
"first" : "second"); + -+ struct bkey_i *update = bch2_bkey_get_mut(trans, &update_iter, -+ btree, update_pos, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ bch2_trans_iter_exit(trans, &update_iter); -+ if ((ret = PTR_ERR_OR_ZERO(update))) ++ if (fsck_err(c, "overlapping extents%s", buf.buf)) { ++ struct btree_iter *old_iter = &iter1; ++ struct disk_reservation res = { 0 }; ++ ++ if (pos1.snapshot < pos2.p.snapshot) { ++ old_iter = &iter2; ++ swap(k1, k2); ++ } ++ ++ trans->extra_journal_res += bch2_bkey_sectors_compressed(k2); ++ ++ ret = bch2_trans_update_extent_overwrite(trans, old_iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, ++ k1, k2) ?: ++ bch2_trans_commit(trans, &res, NULL, ++ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL); ++ bch2_disk_reservation_put(c, &res); ++ ++ if (ret) + goto err; + + *fixed = true; ++ ++ if (pos1.snapshot == pos2.p.snapshot) { ++ /* ++ * We overwrote the first extent, and did the overwrite ++ * in the same snapshot: ++ */ ++ extent_end->offset = bkey_start_offset(&pos2); ++ } else if (pos1.snapshot > pos2.p.snapshot) { ++ /* ++ * We overwrote the first extent in pos2's snapshot: ++ */ ++ ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); ++ } else { ++ /* ++ * We overwrote the second extent - restart ++ * check_extent() from the top: ++ */ ++ ret = -BCH_ERR_transaction_restart_nested; ++ } + } +fsck_err: +err: -+ bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(trans, &iter2); ++ bch2_trans_iter_exit(trans, &iter1); + printbuf_exit(&buf); + return ret; +} @@ -59015,11 +59726,11 @@ index 000000000..d3eb3dc1c + struct extent_ends *extent_ends, + struct bkey_s_c k, + u32 equiv, -+ struct btree_iter *iter) ++ struct btree_iter *iter, ++ bool *fixed) +{ + struct bch_fs *c = trans->c; + struct extent_end *i; -+ bool fixed = false; + int ret = 0; + + /* transaction restart, running again */ @@ -59042,7 +59753,8 @@ index 000000000..d3eb3dc1c + SPOS(iter->pos.inode, + i->offset, + i->snapshot), -+ *k.k, &fixed); ++ &i->seen, ++ *k.k, fixed, i); + if (ret) + goto err; + } @@ -59053,7 +59765,7 @@ index 000000000..d3eb3dc1c + + extent_ends->last_pos = k.k->p; +err: -+ return ret ?: fixed; ++ return ret; +} + +static int check_extent(struct btree_trans *trans, struct btree_iter *iter, @@ -59108,13 +59820,10 @@ index 000000000..d3eb3dc1c + goto delete; + + ret = check_overlapping_extents(trans, s, extent_ends, k, -+ equiv.snapshot, iter); -+ if (ret < 0) -+ goto err; -+ ++ equiv.snapshot, iter, ++ &inode->recalculate_sums); + if (ret) -+ inode->recalculate_sums = true; -+ ret = 0; ++ goto err; + } + + /* @@ -59189,7 +59898,7 @@ index 000000000..d3eb3dc1c + + snapshots_seen_init(&s); + extent_ends_init(&extent_ends); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), @@ -59894,8 +60603,6 @@ index 000000000..d3eb3dc1c + return ret; +} + -+/* check_nlink pass: */ -+ +struct nlink_table { + size_t nr; + size_t size; @@ -60244,14 +60951,15 @@ index 000000000..90c87b508 +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 -index 000000000..8834809d4 +index 000000000..8114b6e4f --- /dev/null +++ b/fs/bcachefs/inode.c -@@ -0,0 +1,925 @@ +@@ -0,0 +1,1111 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" ++#include "btree_write_buffer.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "buckets.h" @@ 
-60260,6 +60968,7 @@ index 000000000..8834809d4 +#include "extent_update.h" +#include "inode.h" +#include "str_hash.h" ++#include "snapshot.h" +#include "subvolume.h" +#include "varint.h" + @@ -60597,6 +61306,8 @@ index 000000000..8834809d4 + return 0; +err: + bch2_trans_iter_exit(trans, iter); ++ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); + return ret; +} + @@ -60769,6 +61480,25 @@ index 000000000..8834809d4 + __bch2_inode_unpacked_to_text(out, &inode); +} + ++static inline u64 bkey_inode_flags(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); ++ case KEY_TYPE_inode_v2: ++ return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); ++ case KEY_TYPE_inode_v3: ++ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); ++ default: ++ return 0; ++ } ++} ++ ++static inline bool bkey_is_deleted_inode(struct bkey_s_c k) ++{ ++ return bkey_inode_flags(k) & BCH_INODE_UNLINKED; ++} ++ +int bch2_trans_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, @@ -60776,6 +61506,8 @@ index 000000000..8834809d4 + unsigned flags) +{ + int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); ++ bool old_deleted = bkey_is_deleted_inode(old); ++ bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new)); + + if (nr) { + int ret = bch2_replicas_deltas_realloc(trans, 0); @@ -60787,6 +61519,12 @@ index 000000000..8834809d4 + d->nr_inodes += nr; + } + ++ if (old_deleted != new_deleted) { ++ int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted); ++ if (ret) ++ return ret; ++ } ++ + return 0; +} + @@ -61173,12 +61911,167 @@ index 000000000..8834809d4 + if (opts->nocow) + opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; +} ++ ++int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter = { NULL }; ++ struct bkey_i_inode_generation delete; ++ struct bch_inode_unpacked inode_u; ++ struct bkey_s_c k; ++ int ret; ++ ++ do { ++ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL) ?: ++ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL) ?: ++ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, ++ SPOS(inum, 0, snapshot), ++ SPOS(inum, U64_MAX, snapshot), ++ 0, NULL); ++ } while (ret == -BCH_ERR_transaction_restart_nested); ++ if (ret) ++ goto err; ++retry: ++ bch2_trans_begin(trans); ++ ++ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, inum, snapshot), BTREE_ITER_INTENT); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!bkey_is_inode(k.k)) { ++ bch2_fs_inconsistent(c, ++ "inode %llu:%u not found when deleting", ++ inum, snapshot); ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_inode_unpack(k, &inode_u); ++ ++ /* Subvolume root? 
*/ ++ if (inode_u.bi_subvol) ++ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); ++ ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p = iter.pos; ++ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); ++ ++ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto retry; ++ ++ return ret ?: -BCH_ERR_transaction_restart_nested; ++} ++ ++static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bch_inode_unpacked inode; ++ int ret; ++ ++ if (bch2_snapshot_is_internal_node(c, pos.snapshot)) ++ return 0; ++ ++ if (!fsck_err_on(c->sb.clean, c, ++ "filesystem marked as clean but have deleted inode %llu:%u", ++ pos.offset, pos.snapshot)) ++ return 0; ++ ++ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; ++ if (fsck_err_on(!bkey_is_inode(k.k), c, ++ "nonexistent inode %llu:%u in deleted_inodes btree", ++ pos.offset, pos.snapshot)) ++ goto delete; ++ ++ ret = bch2_inode_unpack(k, &inode); ++ if (ret) ++ goto err; ++ ++ if (fsck_err_on(S_ISDIR(inode.bi_mode), c, ++ "directory %llu:%u in deleted_inodes btree", ++ pos.offset, pos.snapshot)) ++ goto delete; ++ ++ if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c, ++ "non-deleted inode %llu:%u in deleted_inodes btree", ++ pos.offset, pos.snapshot)) ++ goto delete; ++ ++ return 1; ++err: ++fsck_err: ++ return ret; ++delete: ++ return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); ++} ++ ++int bch2_delete_dead_inodes(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = bch2_btree_write_buffer_flush_sync(&trans); ++ if (ret) ++ goto err; ++ ++ /* ++ * Weird transaction restart handling here because on successful delete, ++ * bch2_inode_rm_snapshot() will return a nested transaction restart, ++ * but we can't retry because the btree write buffer won't have been ++ * flushed and we'd spin: ++ */ ++ for_each_btree_key(&trans, iter, BTREE_ID_deleted_inodes, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { ++ ret = lockrestart_do(&trans, may_delete_deleted_inode(&trans, k.k->p)); ++ if (ret < 0) ++ break; ++ ++ if (ret) { ++ if (!test_bit(BCH_FS_RW, &c->flags)) { ++ bch2_trans_unlock(&trans); ++ bch2_fs_lazy_rw(c); ++ } ++ ++ ret = bch2_inode_rm_snapshot(&trans, k.k->p.offset, k.k->p.snapshot); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ break; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++err: ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new file mode 100644 -index 000000000..7809d1b6d +index 000000000..22b244056 --- /dev/null +++ b/fs/bcachefs/inode.h -@@ -0,0 +1,201 @@ +@@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_H +#define _BCACHEFS_INODE_H @@ -61379,13 +62272,16 @@ index 000000000..7809d1b6d +void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, + struct bch_inode_unpacked *); + ++int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); ++int 
bch2_delete_dead_inodes(struct bch_fs *); ++ +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c new file mode 100644 -index 000000000..5bacc6a9d +index 000000000..3c614c864 --- /dev/null +++ b/fs/bcachefs/io.c -@@ -0,0 +1,3059 @@ +@@ -0,0 +1,3051 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations @@ -61768,10 +62664,10 @@ index 000000000..5bacc6a9d + struct bch_fs *c = trans->c; + struct disk_reservation disk_res = { 0 }; + struct closure cl; -+ struct open_buckets open_buckets; ++ struct open_buckets open_buckets = { 0 }; + struct bkey_s_c k; + struct bkey_buf old, new; -+ unsigned sectors_allocated; ++ unsigned sectors_allocated = 0; + bool have_reservation = false; + bool unwritten = opts.nocow && + c->sb.version >= bcachefs_metadata_version_unwritten_extents; @@ -61780,9 +62676,6 @@ index 000000000..5bacc6a9d + bch2_bkey_buf_init(&old); + bch2_bkey_buf_init(&new); + closure_init_stack(&cl); -+ open_buckets.nr = 0; -+retry: -+ sectors_allocated = 0; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); @@ -61801,14 +62694,14 @@ index 000000000..5bacc6a9d + */ + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) -+ goto out; ++ goto err; + + bch2_bkey_buf_reassemble(&old, c, k); + } + + if (have_reservation) { + if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) -+ goto out; ++ goto err; + + bch2_key_resize(&new.k->k, sectors); + } else if (!unwritten) { @@ -61840,13 +62733,10 @@ index 000000000..5bacc6a9d + opts.data_replicas, + opts.data_replicas, + BCH_WATERMARK_normal, 0, &cl, &wp); -+ if (ret) { -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) -+ goto retry; -+ return ret; -+ } ++ if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) ++ ret = -BCH_ERR_transaction_restart_nested; ++ if (ret) ++ goto err; + + sectors = min(sectors, wp->sectors_free); + sectors_allocated = sectors; @@ -61865,17 +62755,7 @@ index 000000000..5bacc6a9d + + ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, + 0, i_sectors_delta, true); -+out: -+ if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ } -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { -+ bch2_trans_begin(trans); -+ goto retry; -+ } -+ ++err: + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); + @@ -61884,6 +62764,11 @@ index 000000000..5bacc6a9d + bch2_bkey_buf_exit(&new, c); + bch2_bkey_buf_exit(&old, c); + ++ if (closure_nr_remaining(&cl) != 1) { ++ bch2_trans_unlock(trans); ++ closure_sync(&cl); ++ } ++ + return ret; +} + @@ -62098,13 +62983,15 @@ index 000000000..5bacc6a9d + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + ++ EBUG_ON(op->open_buckets.nr); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + bch2_disk_reservation_put(c, &op->res); ++ + if (!(op->flags & BCH_WRITE_MOVE)) + bch2_write_ref_put(c, BCH_WRITE_REF_write); + bch2_keylist_free(&op->insert_keys, op->inline_keys); + -+ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); -+ + EBUG_ON(cl->parent); + closure_debug_destroy(cl); + if (op->end_io) @@ -63823,6 +64710,7 @@ index 000000000..5bacc6a9d + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; ++ + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } @@ -64447,7 +65335,7 @@ 
index 000000000..5bacc6a9d +} diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h new file mode 100644 -index 000000000..1476380d5 +index 000000000..831e3f1b7 --- /dev/null +++ b/fs/bcachefs/io.h @@ -0,0 +1,202 @@ @@ -64505,7 +65393,7 @@ index 000000000..1476380d5 +}; + +enum bch_write_flags { -+#define x(f) BCH_WRITE_##f = 1U << __BCH_WRITE_##f, ++#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), + BCH_WRITE_FLAGS() +#undef x +}; @@ -64826,7 +65714,7 @@ index 000000000..737f16d78 +#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 -index 000000000..80a612c05 +index 000000000..055920c26 --- /dev/null +++ b/fs/bcachefs/journal.c @@ -0,0 +1,1438 @@ @@ -64895,6 +65783,7 @@ index 000000000..80a612c05 +static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) +{ + unsigned i; ++ + for (i = 0; i < ARRAY_SIZE(p->list); i++) + INIT_LIST_HEAD(&p->list[i]); + INIT_LIST_HEAD(&p->flushed); @@ -65346,8 +66235,7 @@ index 000000000..80a612c05 + int ret; + + closure_wait_event(&j->async_wait, -+ (ret = __journal_res_get(j, res, flags)) != -+ -BCH_ERR_journal_res_get_blocked|| ++ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK)); + return ret; +} @@ -66802,10 +67690,10 @@ index 000000000..008a2e25a +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 -index 000000000..f861ae2f1 +index 000000000..34740dca4 --- /dev/null +++ b/fs/bcachefs/journal_io.c -@@ -0,0 +1,1863 @@ +@@ -0,0 +1,1888 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -66822,6 +67710,7 @@ index 000000000..f861ae2f1 +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "replicas.h" ++#include "sb-clean.h" +#include "trace.h" + +static struct nonce journal_nonce(const struct jset *jset) @@ -67016,33 +67905,41 @@ index 000000000..f861ae2f1 +#define JOURNAL_ENTRY_BAD 7 + +static void journal_entry_err_msg(struct printbuf *out, ++ u32 version, + struct jset *jset, + struct jset_entry *entry) +{ -+ prt_str(out, "invalid journal entry "); -+ if (entry) -+ prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]); ++ prt_str(out, "invalid journal entry, version="); ++ bch2_version_to_text(out, version); ++ ++ if (entry) { ++ prt_str(out, " type="); ++ prt_str(out, bch2_jset_entry_types[entry->type]); ++ } ++ ++ if (!jset) { ++ prt_printf(out, " in superblock"); ++ } else { ++ ++ prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); ++ ++ if (entry) ++ prt_printf(out, " offset=%zi/%u", ++ (u64 *) entry - jset->_data, ++ le32_to_cpu(jset->u64s)); ++ } + -+ if (!jset) -+ prt_printf(out, "in superblock"); -+ else if (!entry) -+ prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq)); -+ else -+ prt_printf(out, "at offset %zi/%u seq %llu", -+ (u64 *) entry - jset->_data, -+ le32_to_cpu(jset->u64s), -+ le64_to_cpu(jset->seq)); + prt_str(out, ": "); +} + -+#define journal_entry_err(c, jset, entry, msg, ...) \ ++#define journal_entry_err(c, version, jset, entry, msg, ...) 
\ +({ \ + struct printbuf buf = PRINTBUF; \ + \ -+ journal_entry_err_msg(&buf, jset, entry); \ ++ journal_entry_err_msg(&buf, version, jset, entry); \ + prt_printf(&buf, msg, ##__VA_ARGS__); \ + \ -+ switch (write) { \ ++ switch (flags & BKEY_INVALID_WRITE) { \ + case READ: \ + mustfix_fsck_err(c, "%s", buf.buf); \ + break; \ @@ -67059,8 +67956,8 @@ index 000000000..f861ae2f1 + true; \ +}) + -+#define journal_entry_err_on(cond, c, jset, entry, msg, ...) \ -+ ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false) ++#define journal_entry_err_on(cond, c, version, jset, entry, msg, ...) \ ++ ((cond) ? journal_entry_err(c, version, jset, entry, msg, ##__VA_ARGS__) : false) + +#define FSCK_DELETED_KEY 5 + @@ -67069,13 +67966,15 @@ index 000000000..f861ae2f1 + struct jset_entry *entry, + unsigned level, enum btree_id btree_id, + struct bkey_i *k, -+ unsigned version, int big_endian, int write) ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ ++ int write = flags & BKEY_INVALID_WRITE; + void *next = vstruct_next(entry); + struct printbuf buf = PRINTBUF; + int ret = 0; + -+ if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) { ++ if (journal_entry_err_on(!k->k.u64s, c, version, jset, entry, "k->u64s 0")) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return FSCK_DELETED_KEY; @@ -67083,7 +67982,7 @@ index 000000000..f861ae2f1 + + if (journal_entry_err_on((void *) bkey_next(k) > + (void *) vstruct_next(entry), -+ c, jset, entry, ++ c, version, jset, entry, + "extends past end of journal entry")) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); @@ -67091,7 +67990,7 @@ index 000000000..f861ae2f1 + } + + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, -+ c, jset, entry, ++ c, version, jset, entry, + "bad format %u", k->k.format)) { + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -67106,11 +68005,7 @@ index 000000000..f861ae2f1 + if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), write, &buf)) { + printbuf_reset(&buf); -+ prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:", -+ bch2_jset_entry_types[entry->type], -+ (u64 *) entry - jset->_data, -+ le32_to_cpu(jset->u64s), -+ le64_to_cpu(jset->seq)); ++ journal_entry_err_msg(&buf, version, jset, entry); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + @@ -67138,9 +68033,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_btree_keys_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct bkey_i *k = entry->start; + @@ -67149,7 +68045,7 @@ index 000000000..f861ae2f1 + entry->level, + entry->btree_id, + k, version, big_endian, -+ write|BKEY_INVALID_JOURNAL); ++ flags|BKEY_INVALID_JOURNAL); + if (ret == FSCK_DELETED_KEY) + continue; + @@ -67177,16 +68073,17 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_btree_root_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct bkey_i *k = entry->start; + int ret = 0; + + if 
(journal_entry_err_on(!entry->u64s || + le16_to_cpu(entry->u64s) != k->k.u64s, -+ c, jset, entry, ++ c, version, jset, entry, + "invalid btree root journal entry: wrong number of keys")) { + void *next = vstruct_next(entry); + /* @@ -67200,7 +68097,7 @@ index 000000000..f861ae2f1 + } + + return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, -+ version, big_endian, write); ++ version, big_endian, flags); +fsck_err: + return ret; +} @@ -67212,9 +68109,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_prio_ptrs_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + /* obsolete, don't care: */ + return 0; @@ -67226,14 +68124,15 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_blacklist_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, -+ c, jset, entry, ++ c, version, jset, entry, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } @@ -67251,15 +68150,16 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_blacklist_v2_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct jset_entry_blacklist_v2 *bl_entry; + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, -+ c, jset, entry, ++ c, version, jset, entry, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + goto out; @@ -67269,7 +68169,7 @@ index 000000000..f861ae2f1 + + if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > + le64_to_cpu(bl_entry->end), -+ c, jset, entry, ++ c, version, jset, entry, + "invalid journal seq blacklist entry: start > end")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } @@ -67290,9 +68190,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_usage_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); @@ -67300,7 +68201,7 @@ index 000000000..f861ae2f1 + int ret = 0; + + if (journal_entry_err_on(bytes < sizeof(*u), -+ c, jset, entry, ++ c, version, jset, entry, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; @@ -67322,9 +68223,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_data_usage_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); @@ -67333,7 +68235,7 @@ index 000000000..f861ae2f1 + 
+ if (journal_entry_err_on(bytes < sizeof(*u) || + bytes < sizeof(*u) + u->r.nr_devs, -+ c, jset, entry, ++ c, version, jset, entry, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; @@ -67354,9 +68256,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_clock_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); @@ -67364,13 +68267,13 @@ index 000000000..f861ae2f1 + int ret = 0; + + if (journal_entry_err_on(bytes != sizeof(*clock), -+ c, jset, entry, "bad size")) { ++ c, version, jset, entry, "bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(clock->rw > 1, -+ c, jset, entry, "bad rw")) { ++ c, version, jset, entry, "bad rw")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } @@ -67389,9 +68292,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_dev_usage_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); @@ -67401,7 +68305,7 @@ index 000000000..f861ae2f1 + int ret = 0; + + if (journal_entry_err_on(bytes < expected, -+ c, jset, entry, "bad size (%u < %u)", ++ c, version, jset, entry, "bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; @@ -67410,13 +68314,13 @@ index 000000000..f861ae2f1 + dev = le32_to_cpu(u->dev); + + if (journal_entry_err_on(!bch2_dev_exists2(c, dev), -+ c, jset, entry, "bad dev")) { ++ c, version, jset, entry, "bad dev")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(u->pad, -+ c, jset, entry, "bad pad")) { ++ c, version, jset, entry, "bad pad")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } @@ -67449,9 +68353,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_log_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + return 0; +} @@ -67466,9 +68371,10 @@ index 000000000..f861ae2f1 +} + +static int journal_entry_overwrite_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ struct jset *jset, ++ struct jset_entry *entry, ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, READ); @@ -67482,7 +68388,8 @@ index 000000000..f861ae2f1 + +struct jset_entry_ops { + int (*validate)(struct bch_fs *, struct jset *, -+ struct jset_entry *, unsigned, int, int); ++ struct jset_entry *, unsigned, int, ++ enum bkey_invalid_flags); + void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); +}; + @@ -67499,11 +68406,12 @@ index 000000000..f861ae2f1 +int bch2_journal_entry_validate(struct 
bch_fs *c, + struct jset *jset, + struct jset_entry *entry, -+ unsigned version, int big_endian, int write) ++ unsigned version, int big_endian, ++ enum bkey_invalid_flags flags) +{ + return entry->type < BCH_JSET_ENTRY_NR + ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, -+ version, big_endian, write) ++ version, big_endian, flags) + : 0; +} + @@ -67519,22 +68427,22 @@ index 000000000..f861ae2f1 +} + +static int jset_validate_entries(struct bch_fs *c, struct jset *jset, -+ int write) ++ enum bkey_invalid_flags flags) +{ + struct jset_entry *entry; ++ unsigned version = le32_to_cpu(jset->version); + int ret = 0; + + vstruct_for_each(jset, entry) { -+ if (journal_entry_err_on(vstruct_next(entry) > -+ vstruct_last(jset), c, jset, entry, ++ if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), ++ c, version, jset, entry, + "journal entry extends past end of jset")) { + jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); + break; + } + + ret = bch2_journal_entry_validate(c, jset, entry, -+ le32_to_cpu(jset->version), -+ JSET_BIG_ENDIAN(jset), write); ++ version, JSET_BIG_ENDIAN(jset), flags); + if (ret) + break; + } @@ -67545,7 +68453,7 @@ index 000000000..f861ae2f1 +static int jset_validate(struct bch_fs *c, + struct bch_dev *ca, + struct jset *jset, u64 sector, -+ int write) ++ enum bkey_invalid_flags flags) +{ + unsigned version; + int ret = 0; @@ -67554,7 +68462,8 @@ index 000000000..f861ae2f1 + return JOURNAL_ENTRY_NONE; + + version = le32_to_cpu(jset->version); -+ if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, ++ if (journal_entry_err_on(!bch2_version_compatible(version), ++ c, version, jset, NULL, + "%s sector %llu seq %llu: incompatible journal entry version %u.%u", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), @@ -67565,7 +68474,7 @@ index 000000000..f861ae2f1 + } + + if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), -+ c, jset, NULL, ++ c, version, jset, NULL, + "%s sector %llu seq %llu: journal entry with unknown csum type %llu", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), @@ -67575,7 +68484,7 @@ index 000000000..f861ae2f1 + /* last_seq is ignored when JSET_NO_FLUSH is true */ + if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && + le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), -+ c, jset, NULL, ++ c, version, jset, NULL, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(jset->last_seq), + le64_to_cpu(jset->seq))) { @@ -67583,7 +68492,7 @@ index 000000000..f861ae2f1 + return JOURNAL_ENTRY_BAD; + } + -+ ret = jset_validate_entries(c, jset, write); ++ ret = jset_validate_entries(c, jset, flags); +fsck_err: + return ret; +} @@ -67596,14 +68505,15 @@ index 000000000..f861ae2f1 +{ + size_t bytes = vstruct_bytes(jset); + unsigned version; -+ int write = READ; ++ enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + + version = le32_to_cpu(jset->version); -+ if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, ++ if (journal_entry_err_on(!bch2_version_compatible(version), ++ c, version, jset, NULL, + "%s sector %llu seq %llu: unknown journal entry version %u.%u", + ca ? 
ca->name : c->name, + sector, le64_to_cpu(jset->seq), @@ -67618,7 +68528,7 @@ index 000000000..f861ae2f1 + return JOURNAL_ENTRY_REREAD; + + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, -+ c, jset, NULL, ++ c, version, jset, NULL, + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), bytes)) @@ -67839,6 +68749,7 @@ index 000000000..f861ae2f1 + bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); + for (i = 0; i < 3; i++) { + unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; ++ + bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); + } + ja->sectors_free = 0; @@ -67935,7 +68846,7 @@ index 000000000..f861ae2f1 + * those entries will be blacklisted: + */ + genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { -+ int write = READ; ++ enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; + + i = *_i; + @@ -67957,7 +68868,7 @@ index 000000000..f861ae2f1 + } + + if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), -+ c, &i->j, NULL, ++ c, le32_to_cpu(i->j.version), &i->j, NULL, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq))) @@ -68089,18 +69000,14 @@ index 000000000..f861ae2f1 + + bch2_replicas_entry_sort(&replicas.e); + -+ /* -+ * If we're mounting in degraded mode - if we didn't read all -+ * the devices - this is wrong: -+ */ -+ + printbuf_reset(&buf); + bch2_replicas_entry_to_text(&buf, &replicas.e); + + if (!degraded && -+ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, -+ "superblock not marked as containing replicas %s", -+ buf.buf)) { ++ !bch2_replicas_marked(c, &replicas.e) && ++ (le64_to_cpu(i->j.seq) == *last_seq || ++ fsck_err(c, "superblock not marked as containing replicas for journal entry %llu\n %s", ++ le64_to_cpu(i->j.seq), buf.buf))) { + ret = bch2_mark_replicas(c, &replicas.e); + if (ret) + goto err; @@ -68267,6 +69174,7 @@ index 000000000..f861ae2f1 + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *w = journal_last_unwritten_buf(j); ++ struct bch_replicas_padded replicas; + union journal_res_state old, new; + u64 v, seq; + int err = 0; @@ -68278,7 +69186,13 @@ index 000000000..f861ae2f1 + if (!w->devs_written.nr) { + bch_err(c, "unable to write journal to sufficient devices"); + err = -EIO; ++ } else { ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, ++ w->devs_written); ++ if (bch2_mark_replicas(c, &replicas.e)) ++ err = -EIO; + } ++ + if (err) + bch2_fatal_error(c); + @@ -68415,7 +69329,6 @@ index 000000000..f861ae2f1 + } + + continue_at(cl, journal_write_done, c->io_complete_wq); -+ return; +} + +static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset) @@ -68671,10 +69584,10 @@ index 000000000..f861ae2f1 +} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h new file mode 100644 -index 000000000..8801e9810 +index 000000000..a88d097b1 --- /dev/null +++ b/fs/bcachefs/journal_io.h -@@ -0,0 +1,64 @@ +@@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_IO_H +#define _BCACHEFS_JOURNAL_IO_H @@ -68727,7 +69640,8 @@ index 000000000..8801e9810 + jset_entry_for_each_key(entry, k) + +int bch2_journal_entry_validate(struct bch_fs *, struct jset *, -+ struct jset_entry *, unsigned, int, int); ++ struct jset_entry *, unsigned, int, ++ enum bkey_invalid_flags); +void bch2_journal_entry_to_text(struct printbuf *, struct 
bch_fs *, + struct jset_entry *); + @@ -68741,22 +69655,23 @@ index 000000000..8801e9810 +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c new file mode 100644 -index 000000000..8de83e103 +index 000000000..10e1860da --- /dev/null +++ b/fs/bcachefs/journal_reclaim.c -@@ -0,0 +1,873 @@ +@@ -0,0 +1,874 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_update.h" ++#include "buckets.h" +#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "replicas.h" -+#include "super.h" ++#include "sb-members.h" +#include "trace.h" + +#include @@ -69092,7 +70007,7 @@ index 000000000..8de83e103 + list_del_init(&pin->list); + + /* -+ * Unpinning a journal entry make make journal_next_bucket() succeed, if ++ * Unpinning a journal entry may make journal_next_bucket() succeed, if + * writing a new last_seq will now make another bucket available: + */ + return atomic_dec_and_test(&pin_list->count) && @@ -71278,10 +72193,10 @@ index 000000000..027efaa0d +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c new file mode 100644 -index 000000000..052726739 +index 000000000..fb76a1dac --- /dev/null +++ b/fs/bcachefs/move.c -@@ -0,0 +1,1168 @@ +@@ -0,0 +1,1162 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -72387,46 +73302,40 @@ index 000000000..052726739 + return ret; +} + -+void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bch_move_stats *stats; -+ -+ mutex_lock(&c->data_progress_lock); -+ list_for_each_entry(stats, &c->data_progress_list, list) { -+ prt_printf(out, "%s: data type %s btree_id %s position: ", -+ stats->name, -+ bch2_data_types[stats->data_type], -+ bch2_btree_ids[stats->btree_id]); -+ bch2_bpos_to_text(out, stats->pos); -+ prt_printf(out, "%s", "\n"); -+ } -+ mutex_unlock(&c->data_progress_lock); -+} -+ -+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt) ++static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) +{ ++ struct bch_move_stats *stats = ctxt->stats; + struct moving_io *io; + -+ prt_printf(out, "%ps:", ctxt->fn); ++ prt_printf(out, "%s (%ps):", stats->name, ctxt->fn); ++ prt_newline(out); ++ ++ prt_printf(out, " data type %s btree_id %s position: ", ++ bch2_data_types[stats->data_type], ++ bch2_btree_ids[stats->btree_id]); ++ bch2_bpos_to_text(out, stats->pos); + prt_newline(out); + printbuf_indent_add(out, 2); + -+ prt_printf(out, "reads: %u sectors %u", ++ prt_printf(out, "reads: ios %u/%u sectors %u/%u", + atomic_read(&ctxt->read_ios), -+ atomic_read(&ctxt->read_sectors)); ++ c->opts.move_ios_in_flight, ++ atomic_read(&ctxt->read_sectors), ++ c->opts.move_bytes_in_flight >> 9); + prt_newline(out); + -+ prt_printf(out, "writes: %u sectors %u", ++ prt_printf(out, "writes: ios %u/%u sectors %u/%u", + atomic_read(&ctxt->write_ios), -+ atomic_read(&ctxt->write_sectors)); ++ c->opts.move_ios_in_flight, ++ atomic_read(&ctxt->write_sectors), ++ c->opts.move_bytes_in_flight >> 9); + prt_newline(out); + + printbuf_indent_add(out, 2); + + mutex_lock(&ctxt->lock); -+ list_for_each_entry(io, &ctxt->ios, io_list) { ++ list_for_each_entry(io, &ctxt->ios, io_list) + bch2_write_op_to_text(out, &io->write.op); -+ } + mutex_unlock(&ctxt->lock); + + printbuf_indent_sub(out, 4); @@ -72438,7 +73347,7 @@ index 000000000..052726739 + + 
mutex_lock(&c->moving_context_lock); + list_for_each_entry(ctxt, &c->moving_context_list, list) -+ bch2_moving_ctxt_to_text(out, ctxt); ++ bch2_moving_ctxt_to_text(out, c, ctxt); + mutex_unlock(&c->moving_context_lock); +} + @@ -72452,10 +73361,10 @@ index 000000000..052726739 +} diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h new file mode 100644 -index 000000000..547ee7b72 +index 000000000..c3136abe8 --- /dev/null +++ b/fs/bcachefs/move.h -@@ -0,0 +1,96 @@ +@@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVE_H +#define _BCACHEFS_MOVE_H @@ -72546,7 +73455,6 @@ index 000000000..547ee7b72 + struct bch_ioctl_data); + +void bch2_move_stats_init(struct bch_move_stats *stats, char *name); -+void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *); +void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); + +void bch2_fs_move_init(struct bch_fs *); @@ -72596,10 +73504,10 @@ index 000000000..baf1f8570 +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c new file mode 100644 -index 000000000..5242f20bb +index 000000000..256431a6d --- /dev/null +++ b/fs/bcachefs/movinggc.c -@@ -0,0 +1,421 @@ +@@ -0,0 +1,423 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Moving/copying garbage collector @@ -72822,8 +73730,10 @@ index 000000000..5242f20bb + + f = move_bucket_in_flight_add(buckets_in_flight, *i); + ret = PTR_ERR_OR_ZERO(f); -+ if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */ ++ if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */ ++ ret = 0; + continue; ++ } + if (ret == -ENOMEM) { /* flush IO, continue later */ + ret = 0; + break; @@ -73251,10 +74161,10 @@ index 000000000..bd12bf677 + diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c new file mode 100644 -index 000000000..9dcc61ee5 +index 000000000..960bb247f --- /dev/null +++ b/fs/bcachefs/opts.c -@@ -0,0 +1,592 @@ +@@ -0,0 +1,599 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include @@ -73267,7 +74177,7 @@ index 000000000..9dcc61ee5 +#include "super-io.h" +#include "util.h" + -+#define x(t, n) [n] = #t, ++#define x(t, n, ...) [n] = #t, + +const char * const bch2_error_actions[] = { + BCH_ERROR_ACTIONS() @@ -73352,8 +74262,8 @@ index 000000000..9dcc61ee5 + +#undef x + -+int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, -+ struct printbuf *err) ++static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, ++ struct printbuf *err) +{ + if (!val) { + *res = FSCK_FIX_yes; @@ -73370,18 +74280,18 @@ index 000000000..9dcc61ee5 + return 0; +} + -+void bch2_opt_fix_errors_to_text(struct printbuf *out, -+ struct bch_fs *c, -+ struct bch_sb *sb, -+ u64 v) ++static void bch2_opt_fix_errors_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_sb *sb, ++ u64 v) +{ + prt_str(out, bch2_fsck_fix_opts[v]); +} + -+static const struct bch_opt_fn bch2_opt_fix_errors = { -+ .parse = bch2_opt_fix_errors_parse, -+ .to_text = bch2_opt_fix_errors_to_text, -+}; ++#define bch2_opt_fix_errors (struct bch_opt_fn) { \ ++ .parse = bch2_opt_fix_errors_parse, \ ++ .to_text = bch2_opt_fix_errors_to_text, \ ++} + +const char * const bch2_d_types[BCH_DT_MAX] = { + [DT_UNKNOWN] = "unknown", @@ -73702,6 +74612,13 @@ index 000000000..9dcc61ee5 + if (!options) + return 0; + ++ /* ++ * sys_fsconfig() is now occasionally providing us with option lists ++ * starting with a comma - weird. 
++ */ ++ if (*options == ',') ++ options++; ++ + copied_opts = kstrdup(options, GFP_KERNEL); + if (!copied_opts) + return -1; @@ -75129,7 +76046,7 @@ index 000000000..2191423d9 +#endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c new file mode 100644 -index 000000000..4f0654ff8 +index 000000000..ca99772ae --- /dev/null +++ b/fs/bcachefs/quota.c @@ -0,0 +1,981 @@ @@ -75140,7 +76057,7 @@ index 000000000..4f0654ff8 +#include "error.h" +#include "inode.h" +#include "quota.h" -+#include "subvolume.h" ++#include "snapshot.h" +#include "super-io.h" + +static const char * const bch2_quota_types[] = { @@ -76245,10 +77162,10 @@ index 000000000..6a136083d +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 -index 000000000..c3d577236 +index 000000000..15ce3ecba --- /dev/null +++ b/fs/bcachefs/rebalance.c -@@ -0,0 +1,364 @@ +@@ -0,0 +1,368 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -76364,6 +77281,10 @@ index 000000000..c3d577236 + unsigned percent_full; + u64 work = dev_work + unknown_dev; + ++ /* avoid divide by 0 */ ++ if (!capacity) ++ return; ++ + if (work < dev_work || work < unknown_dev) + work = U64_MAX; + work = min(work, capacity); @@ -76681,10 +77602,10 @@ index 000000000..7462a92e9 +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 -index 000000000..dcd4f9f41 +index 000000000..30efb3c90 --- /dev/null +++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1670 @@ +@@ -0,0 +1,1057 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -76692,6 +77613,7 @@ index 000000000..dcd4f9f41 +#include "bkey_buf.h" +#include "alloc_background.h" +#include "btree_gc.h" ++#include "btree_journal_iter.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_io.h" @@ -76710,6 +77632,8 @@ index 000000000..dcd4f9f41 +#include "quota.h" +#include "recovery.h" +#include "replicas.h" ++#include "sb-clean.h" ++#include "snapshot.h" +#include "subvolume.h" +#include "super-io.h" + @@ -76744,524 +77668,6 @@ index 000000000..dcd4f9f41 + bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; +} + -+/* iterate over keys read from the journal: */ -+ -+static int __journal_key_cmp(enum btree_id l_btree_id, -+ unsigned l_level, -+ struct bpos l_pos, -+ const struct journal_key *r) -+{ -+ return (cmp_int(l_btree_id, r->btree_id) ?: -+ cmp_int(l_level, r->level) ?: -+ bpos_cmp(l_pos, r->k->k.p)); -+} -+ -+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -+{ -+ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); -+} -+ -+static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) -+{ -+ size_t gap_size = keys->size - keys->nr; -+ -+ if (idx >= keys->gap) -+ idx += gap_size; -+ return idx; -+} -+ -+static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) -+{ -+ return keys->d + idx_to_pos(keys, idx); -+} -+ -+static size_t __bch2_journal_key_search(struct journal_keys *keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ size_t l = 0, r = keys->nr, m; -+ -+ while (l < r) { -+ m = l + ((r - l) >> 1); -+ if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) -+ l = m + 1; -+ else -+ r = m; -+ } -+ -+ BUG_ON(l < keys->nr && -+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); -+ -+ BUG_ON(l && -+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); -+ -+ return l; -+} -+ 
-+static size_t bch2_journal_key_search(struct journal_keys *keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); -+} -+ -+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, -+ unsigned level, struct bpos pos, -+ struct bpos end_pos, size_t *idx) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ unsigned iters = 0; -+ struct journal_key *k; -+search: -+ if (!*idx) -+ *idx = __bch2_journal_key_search(keys, btree_id, level, pos); -+ -+ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { -+ if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) -+ return NULL; -+ -+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && -+ !k->overwritten) -+ return k->k; -+ -+ (*idx)++; -+ iters++; -+ if (iters == 10) { -+ *idx = 0; -+ goto search; -+ } -+ } -+ -+ return NULL; -+} -+ -+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, -+ unsigned level, struct bpos pos) -+{ -+ size_t idx = 0; -+ -+ return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); -+} -+ -+static void journal_iters_fix(struct bch_fs *c) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ /* The key we just inserted is immediately before the gap: */ -+ size_t gap_end = keys->gap + (keys->size - keys->nr); -+ struct btree_and_journal_iter *iter; -+ -+ /* -+ * If an iterator points one after the key we just inserted, decrement -+ * the iterator so it points at the key we just inserted - if the -+ * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will -+ * handle that: -+ */ -+ list_for_each_entry(iter, &c->journal_iters, journal.list) -+ if (iter->journal.idx == gap_end) -+ iter->journal.idx = keys->gap - 1; -+} -+ -+static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ struct journal_iter *iter; -+ size_t gap_size = keys->size - keys->nr; -+ -+ list_for_each_entry(iter, &c->journal_iters, list) { -+ if (iter->idx > old_gap) -+ iter->idx -= gap_size; -+ if (iter->idx >= new_gap) -+ iter->idx += gap_size; -+ } -+} -+ -+int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_i *k) -+{ -+ struct journal_key n = { -+ .btree_id = id, -+ .level = level, -+ .k = k, -+ .allocated = true, -+ /* -+ * Ensure these keys are done last by journal replay, to unblock -+ * journal reclaim: -+ */ -+ .journal_seq = U32_MAX, -+ }; -+ struct journal_keys *keys = &c->journal_keys; -+ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); -+ -+ BUG_ON(test_bit(BCH_FS_RW, &c->flags)); -+ -+ if (idx < keys->size && -+ journal_key_cmp(&n, &keys->d[idx]) == 0) { -+ if (keys->d[idx].allocated) -+ kfree(keys->d[idx].k); -+ keys->d[idx] = n; -+ return 0; -+ } -+ -+ if (idx > keys->gap) -+ idx -= keys->size - keys->nr; -+ -+ if (keys->nr == keys->size) { -+ struct journal_keys new_keys = { -+ .nr = keys->nr, -+ .size = max_t(size_t, keys->size, 8) * 2, -+ }; -+ -+ new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); -+ if (!new_keys.d) { -+ bch_err(c, "%s: error allocating new key array (size %zu)", -+ __func__, new_keys.size); -+ return -BCH_ERR_ENOMEM_journal_key_insert; -+ } -+ -+ /* Since @keys was full, there was no gap: */ -+ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); -+ kvfree(keys->d); -+ *keys = new_keys; -+ -+ /* And now the gap is at the end: */ -+ keys->gap = keys->nr; 
-+ } -+ -+ journal_iters_move_gap(c, keys->gap, idx); -+ -+ move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); -+ keys->gap = idx; -+ -+ keys->nr++; -+ keys->d[keys->gap++] = n; -+ -+ journal_iters_fix(c); -+ -+ return 0; -+} -+ -+/* -+ * Can only be used from the recovery thread while we're still RO - can't be -+ * used once we've got RW, as journal_keys is at that point used by multiple -+ * threads: -+ */ -+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_i *k) -+{ -+ struct bkey_i *n; -+ int ret; -+ -+ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); -+ if (!n) -+ return -BCH_ERR_ENOMEM_journal_key_insert; -+ -+ bkey_copy(n, k); -+ ret = bch2_journal_key_insert_take(c, id, level, n); -+ if (ret) -+ kfree(n); -+ return ret; -+} -+ -+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bpos pos) -+{ -+ struct bkey_i whiteout; -+ -+ bkey_init(&whiteout.k); -+ whiteout.k.p = pos; -+ -+ return bch2_journal_key_insert(c, id, level, &whiteout); -+} -+ -+void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, -+ unsigned level, struct bpos pos) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ size_t idx = bch2_journal_key_search(keys, btree, level, pos); -+ -+ if (idx < keys->size && -+ keys->d[idx].btree_id == btree && -+ keys->d[idx].level == level && -+ bpos_eq(keys->d[idx].k->k.p, pos)) -+ keys->d[idx].overwritten = true; -+} -+ -+static void bch2_journal_iter_advance(struct journal_iter *iter) -+{ -+ if (iter->idx < iter->keys->size) { -+ iter->idx++; -+ if (iter->idx == iter->keys->gap) -+ iter->idx += iter->keys->size - iter->keys->nr; -+ } -+} -+ -+static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) -+{ -+ struct journal_key *k = iter->keys->d + iter->idx; -+ -+ while (k < iter->keys->d + iter->keys->size && -+ k->btree_id == iter->btree_id && -+ k->level == iter->level) { -+ if (!k->overwritten) -+ return bkey_i_to_s_c(k->k); -+ -+ bch2_journal_iter_advance(iter); -+ k = iter->keys->d + iter->idx; -+ } -+ -+ return bkey_s_c_null; -+} -+ -+static void bch2_journal_iter_exit(struct journal_iter *iter) -+{ -+ list_del(&iter->list); -+} -+ -+static void bch2_journal_iter_init(struct bch_fs *c, -+ struct journal_iter *iter, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ iter->btree_id = id; -+ iter->level = level; -+ iter->keys = &c->journal_keys; -+ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); -+} -+ -+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) -+{ -+ return bch2_btree_node_iter_peek_unpack(&iter->node_iter, -+ iter->b, &iter->unpacked); -+} -+ -+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) -+{ -+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); -+} -+ -+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) -+{ -+ if (bpos_eq(iter->pos, SPOS_MAX)) -+ iter->at_end = true; -+ else -+ iter->pos = bpos_successor(iter->pos); -+} -+ -+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) -+{ -+ struct bkey_s_c btree_k, journal_k, ret; -+again: -+ if (iter->at_end) -+ return bkey_s_c_null; -+ -+ while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && -+ bpos_lt(btree_k.k->p, iter->pos)) -+ bch2_journal_iter_advance_btree(iter); -+ -+ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && -+ bpos_lt(journal_k.k->p, iter->pos)) -+ bch2_journal_iter_advance(&iter->journal); 
-+ -+ ret = journal_k.k && -+ (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) -+ ? journal_k -+ : btree_k; -+ -+ if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) -+ ret = bkey_s_c_null; -+ -+ if (ret.k) { -+ iter->pos = ret.k->p; -+ if (bkey_deleted(ret.k)) { -+ bch2_btree_and_journal_iter_advance(iter); -+ goto again; -+ } -+ } else { -+ iter->pos = SPOS_MAX; -+ iter->at_end = true; -+ } -+ -+ return ret; -+} -+ -+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) -+{ -+ bch2_journal_iter_exit(&iter->journal); -+} -+ -+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, -+ struct bch_fs *c, -+ struct btree *b, -+ struct btree_node_iter node_iter, -+ struct bpos pos) -+{ -+ memset(iter, 0, sizeof(*iter)); -+ -+ iter->b = b; -+ iter->node_iter = node_iter; -+ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); -+ INIT_LIST_HEAD(&iter->journal.list); -+ iter->pos = b->data->min_key; -+ iter->at_end = false; -+} -+ -+/* -+ * this version is used by btree_gc before filesystem has gone RW and -+ * multithreaded, so uses the journal_iters list: -+ */ -+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, -+ struct bch_fs *c, -+ struct btree *b) -+{ -+ struct btree_node_iter node_iter; -+ -+ bch2_btree_node_iter_init_from_start(&node_iter, b); -+ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); -+ list_add(&iter->journal.list, &c->journal_iters); -+} -+ -+/* sort and dedup all keys in the journal: */ -+ -+void bch2_journal_entries_free(struct bch_fs *c) -+{ -+ struct journal_replay **i; -+ struct genradix_iter iter; -+ -+ genradix_for_each(&c->journal_entries, iter, i) -+ if (*i) -+ kvpfree(*i, offsetof(struct journal_replay, j) + -+ vstruct_bytes(&(*i)->j)); -+ genradix_free(&c->journal_entries); -+} -+ -+/* -+ * When keys compare equal, oldest compares first: -+ */ -+static int journal_sort_key_cmp(const void *_l, const void *_r) -+{ -+ const struct journal_key *l = _l; -+ const struct journal_key *r = _r; -+ -+ return journal_key_cmp(l, r) ?: -+ cmp_int(l->journal_seq, r->journal_seq) ?: -+ cmp_int(l->journal_offset, r->journal_offset); -+} -+ -+void bch2_journal_keys_free(struct journal_keys *keys) -+{ -+ struct journal_key *i; -+ -+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); -+ keys->gap = keys->nr; -+ -+ for (i = keys->d; i < keys->d + keys->nr; i++) -+ if (i->allocated) -+ kfree(i->k); -+ -+ kvfree(keys->d); -+ keys->d = NULL; -+ keys->nr = keys->gap = keys->size = 0; -+} -+ -+static void __journal_keys_sort(struct journal_keys *keys) -+{ -+ struct journal_key *src, *dst; -+ -+ sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); -+ -+ src = dst = keys->d; -+ while (src < keys->d + keys->nr) { -+ while (src + 1 < keys->d + keys->nr && -+ src[0].btree_id == src[1].btree_id && -+ src[0].level == src[1].level && -+ bpos_eq(src[0].k->k.p, src[1].k->k.p)) -+ src++; -+ -+ *dst++ = *src++; -+ } -+ -+ keys->nr = dst - keys->d; -+} -+ -+static int journal_keys_sort(struct bch_fs *c) -+{ -+ struct genradix_iter iter; -+ struct journal_replay *i, **_i; -+ struct jset_entry *entry; -+ struct bkey_i *k; -+ struct journal_keys *keys = &c->journal_keys; -+ size_t nr_keys = 0, nr_read = 0; -+ -+ genradix_for_each(&c->journal_entries, iter, _i) { -+ i = *_i; -+ -+ if (!i || i->ignore) -+ continue; -+ -+ for_each_jset_key(k, entry, &i->j) -+ nr_keys++; -+ } -+ -+ if (!nr_keys) -+ return 0; -+ -+ 
keys->size = roundup_pow_of_two(nr_keys); -+ -+ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); -+ if (!keys->d) { -+ bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", -+ nr_keys); -+ -+ do { -+ keys->size >>= 1; -+ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); -+ } while (!keys->d && keys->size > nr_keys / 8); -+ -+ if (!keys->d) { -+ bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", -+ keys->size); -+ return -BCH_ERR_ENOMEM_journal_keys_sort; -+ } -+ } -+ -+ genradix_for_each(&c->journal_entries, iter, _i) { -+ i = *_i; -+ -+ if (!i || i->ignore) -+ continue; -+ -+ cond_resched(); -+ -+ for_each_jset_key(k, entry, &i->j) { -+ if (keys->nr == keys->size) { -+ __journal_keys_sort(keys); -+ -+ if (keys->nr > keys->size * 7 / 8) { -+ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", -+ keys->nr, keys->size, nr_read, nr_keys); -+ return -BCH_ERR_ENOMEM_journal_keys_sort; -+ } -+ } -+ -+ keys->d[keys->nr++] = (struct journal_key) { -+ .btree_id = entry->btree_id, -+ .level = entry->level, -+ .k = k, -+ .journal_seq = le64_to_cpu(i->j.seq), -+ .journal_offset = k->_data - i->j._data, -+ }; -+ -+ nr_read++; -+ } -+ } -+ -+ __journal_keys_sort(keys); -+ keys->gap = keys->nr; -+ -+ bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); -+ return 0; -+} -+ +/* journal replay: */ + +static void replay_now_at(struct journal *j, u64 seq) @@ -77335,7 +77741,7 @@ index 000000000..dcd4f9f41 + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + -+ keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); ++ keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL); + if (!keys_sorted) + return -BCH_ERR_ENOMEM_journal_replay; + @@ -77533,134 +77939,6 @@ index 000000000..dcd4f9f41 + +/* sb clean section: */ + -+static struct bkey_i *btree_root_find(struct bch_fs *c, -+ struct bch_sb_field_clean *clean, -+ struct jset *j, -+ enum btree_id id, unsigned *level) -+{ -+ struct bkey_i *k; -+ struct jset_entry *entry, *start, *end; -+ -+ if (clean) { -+ start = clean->start; -+ end = vstruct_end(&clean->field); -+ } else { -+ start = j->start; -+ end = vstruct_last(j); -+ } -+ -+ for (entry = start; entry < end; entry = vstruct_next(entry)) -+ if (entry->type == BCH_JSET_ENTRY_btree_root && -+ entry->btree_id == id) -+ goto found; -+ -+ return NULL; -+found: -+ if (!entry->u64s) -+ return ERR_PTR(-EINVAL); -+ -+ k = entry->start; -+ *level = entry->level; -+ return k; -+} -+ -+static int verify_superblock_clean(struct bch_fs *c, -+ struct bch_sb_field_clean **cleanp, -+ struct jset *j) -+{ -+ unsigned i; -+ struct bch_sb_field_clean *clean = *cleanp; -+ struct printbuf buf1 = PRINTBUF; -+ struct printbuf buf2 = PRINTBUF; -+ int ret = 0; -+ -+ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, -+ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", -+ le64_to_cpu(clean->journal_seq), -+ le64_to_cpu(j->seq))) { -+ kfree(clean); -+ *cleanp = NULL; -+ return 0; -+ } -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ struct bkey_i *k1, *k2; -+ unsigned l1 = 0, l2 = 0; -+ -+ k1 = btree_root_find(c, clean, NULL, i, &l1); -+ k2 = btree_root_find(c, NULL, j, i, &l2); -+ -+ if (!k1 && !k2) -+ continue; -+ -+ printbuf_reset(&buf1); -+ printbuf_reset(&buf2); -+ -+ if (k1) -+ bch2_bkey_val_to_text(&buf1, c, 
bkey_i_to_s_c(k1)); -+ else -+ prt_printf(&buf1, "(none)"); -+ -+ if (k2) -+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); -+ else -+ prt_printf(&buf2, "(none)"); -+ -+ mustfix_fsck_err_on(!k1 || !k2 || -+ IS_ERR(k1) || -+ IS_ERR(k2) || -+ k1->k.u64s != k2->k.u64s || -+ memcmp(k1, k2, bkey_bytes(&k1->k)) || -+ l1 != l2, c, -+ "superblock btree root %u doesn't match journal after clean shutdown\n" -+ "sb: l=%u %s\n" -+ "journal: l=%u %s\n", i, -+ l1, buf1.buf, -+ l2, buf2.buf); -+ } -+fsck_err: -+ printbuf_exit(&buf2); -+ printbuf_exit(&buf1); -+ return ret; -+} -+ -+static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) -+{ -+ struct bch_sb_field_clean *clean, *sb_clean; -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); -+ -+ if (fsck_err_on(!sb_clean, c, -+ "superblock marked clean but clean section not present")) { -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ c->sb.clean = false; -+ mutex_unlock(&c->sb_lock); -+ return NULL; -+ } -+ -+ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), -+ GFP_KERNEL); -+ if (!clean) { -+ mutex_unlock(&c->sb_lock); -+ return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); -+ } -+ -+ ret = bch2_sb_clean_validate_late(c, clean, READ); -+ if (ret) { -+ mutex_unlock(&c->sb_lock); -+ return ERR_PTR(ret); -+ } -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return clean; -+fsck_err: -+ mutex_unlock(&c->sb_lock); -+ return ERR_PTR(ret); -+} -+ +static bool btree_id_is_alloc(enum btree_id id) +{ + switch (id) { @@ -77807,9 +78085,38 @@ index 000000000..dcd4f9f41 + return ret; +} + ++const char * const bch2_recovery_passes[] = { ++#define x(_fn, _when) #_fn, ++ BCH_RECOVERY_PASSES() ++#undef x ++ NULL ++}; ++ ++static int bch2_check_allocations(struct bch_fs *c) ++{ ++ return bch2_gc(c, true, c->opts.norecovery); ++} ++ ++static int bch2_set_may_go_rw(struct bch_fs *c) ++{ ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); ++ return 0; ++} ++ ++struct recovery_pass_fn { ++ int (*fn)(struct bch_fs *); ++ unsigned when; ++}; ++ ++static struct recovery_pass_fn recovery_pass_fns[] = { ++#define x(_fn, _when) { .fn = bch2_##_fn, .when = _when }, ++ BCH_RECOVERY_PASSES() ++#undef x ++}; ++ +static void check_version_upgrade(struct bch_fs *c) +{ -+ unsigned latest_compatible = bch2_version_compatible(c->sb.version); ++ unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version); + unsigned latest_version = bcachefs_metadata_version_current; + unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; + unsigned new_version = 0; @@ -77859,7 +78166,12 @@ index 000000000..dcd4f9f41 + + recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); + if (recovery_passes) { -+ prt_str(&buf, "fsck required"); ++ if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK) ++ prt_str(&buf, "fsck required"); ++ else { ++ prt_str(&buf, "running recovery passses: "); ++ prt_bitflags(&buf, bch2_recovery_passes, recovery_passes); ++ } + + c->recovery_passes_explicit |= recovery_passes; + c->opts.fix_errors = FSCK_FIX_yes; @@ -77875,42 +78187,19 @@ index 000000000..dcd4f9f41 + } +} + -+static int bch2_check_allocations(struct bch_fs *c) -+{ -+ return bch2_gc(c, true, c->opts.norecovery); -+} -+ -+static int bch2_set_may_go_rw(struct bch_fs *c) -+{ -+ set_bit(BCH_FS_MAY_GO_RW, &c->flags); -+ return 0; -+} -+ -+struct recovery_pass_fn { -+ int (*fn)(struct bch_fs *); -+ const char *name; -+ unsigned when; -+}; -+ -+static struct recovery_pass_fn recovery_passes[] = { 
-+#define x(_fn, _when) { .fn = bch2_##_fn, .name = #_fn, .when = _when }, -+ BCH_RECOVERY_PASSES() -+#undef x -+}; -+ +u64 bch2_fsck_recovery_passes(void) +{ + u64 ret = 0; + -+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++) -+ if (recovery_passes[i].when & PASS_FSCK) ++ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) ++ if (recovery_pass_fns[i].when & PASS_FSCK) + ret |= BIT_ULL(i); + return ret; +} + +static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ -+ struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass; ++ struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass; + + if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) + return false; @@ -77932,15 +78221,18 @@ index 000000000..dcd4f9f41 + c->curr_recovery_pass = pass; + + if (should_run_recovery_pass(c, pass)) { -+ struct recovery_pass_fn *p = recovery_passes + pass; ++ struct recovery_pass_fn *p = recovery_pass_fns + pass; + + if (!(p->when & PASS_SILENT)) -+ printk(KERN_INFO bch2_log_msg(c, "%s..."), p->name); ++ printk(KERN_INFO bch2_log_msg(c, "%s..."), ++ bch2_recovery_passes[pass]); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + printk(KERN_CONT " done\n"); ++ ++ c->recovery_passes_complete |= BIT_ULL(pass); + } + + return 0; @@ -77950,7 +78242,7 @@ index 000000000..dcd4f9f41 +{ + int ret = 0; + -+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) { ++ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) + continue; @@ -77970,17 +78262,17 @@ index 000000000..dcd4f9f41 + bool write_sb = false; + int ret = 0; + -+ if (c->sb.clean) -+ clean = read_superblock_clean(c); -+ ret = PTR_ERR_OR_ZERO(clean); -+ if (ret) -+ goto err; ++ if (c->sb.clean) { ++ clean = bch2_read_superblock_clean(c); ++ ret = PTR_ERR_OR_ZERO(clean); ++ if (ret) ++ goto err; + -+ if (c->sb.clean) + bch_info(c, "recovering from clean shutdown, journal seq %llu", + le64_to_cpu(clean->journal_seq)); -+ else ++ } else { + bch_info(c, "recovering from unclean shutdown"); ++ } + + if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { + bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); @@ -77995,12 +78287,6 @@ index 000000000..dcd4f9f41 + goto err; + } + -+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { -+ bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); -+ ret = -EINVAL; -+ goto err; -+ } -+ + if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery)) + check_version_upgrade(c); + @@ -78060,12 +78346,12 @@ index 000000000..dcd4f9f41 + } + } + -+ ret = journal_keys_sort(c); ++ ret = bch2_journal_keys_sort(c); + if (ret) + goto err; + + if (c->sb.clean && last_journal_entry) { -+ ret = verify_superblock_clean(c, &clean, ++ ret = bch2_verify_superblock_clean(c, &clean, + last_journal_entry); + if (ret) + goto err; @@ -78082,7 +78368,7 @@ index 000000000..dcd4f9f41 + } + + c->journal_replay_seq_start = last_seq; -+ c->journal_replay_seq_end = blacklist_seq - 1;; ++ c->journal_replay_seq_end = blacklist_seq - 1; + + if (c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); @@ -78145,6 +78431,29 @@ index 000000000..dcd4f9f41 + if (ret) + goto err; + ++ /* If we fixed errors, verify that fs is actually clean now: */ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) 
&& ++ test_bit(BCH_FS_ERRORS_FIXED, &c->flags) && ++ !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) && ++ !test_bit(BCH_FS_ERROR, &c->flags)) { ++ bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); ++ clear_bit(BCH_FS_ERRORS_FIXED, &c->flags); ++ ++ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; ++ ++ ret = bch2_run_recovery_passes(c); ++ if (ret) ++ goto err; ++ ++ if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) || ++ test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { ++ bch_err(c, "Second fsck run was not clean"); ++ set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); ++ } ++ ++ set_bit(BCH_FS_ERRORS_FIXED, &c->flags); ++ } ++ + if (enabled_qtypes(c)) { + bch_verbose(c, "reading quotas"); + ret = bch2_fs_quota_read(c); @@ -78177,7 +78486,6 @@ index 000000000..dcd4f9f41 + mutex_unlock(&c->sb_lock); + + if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || -+ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || + c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { + struct bch_move_stats stats; + @@ -78245,7 +78553,7 @@ index 000000000..dcd4f9f41 + } + mutex_unlock(&c->sb_lock); + -+ c->curr_recovery_pass = ARRAY_SIZE(recovery_passes); ++ c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); + @@ -78357,63 +78665,36 @@ index 000000000..dcd4f9f41 +} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h new file mode 100644 -index 000000000..f8e796c0f +index 000000000..852d30567 --- /dev/null +++ b/fs/bcachefs/recovery.h -@@ -0,0 +1,60 @@ +@@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_H +#define _BCACHEFS_RECOVERY_H + -+struct journal_iter { -+ struct list_head list; -+ enum btree_id btree_id; -+ unsigned level; -+ size_t idx; -+ struct journal_keys *keys; -+}; ++extern const char * const bch2_recovery_passes[]; + +/* -+ * Iterate over keys in the btree, with keys from the journal overlaid on top: ++ * For when we need to rewind recovery passes and run a pass we skipped: + */ ++static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, ++ enum bch_recovery_pass pass) ++{ ++ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", ++ bch2_recovery_passes[pass], pass, ++ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + -+struct btree_and_journal_iter { -+ struct btree *b; -+ struct btree_node_iter node_iter; -+ struct bkey unpacked; ++ c->recovery_passes_explicit |= BIT_ULL(pass); + -+ struct journal_iter journal; -+ struct bpos pos; -+ bool at_end; -+}; -+ -+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, -+ unsigned, struct bpos, struct bpos, size_t *); -+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, -+ unsigned, struct bpos); -+ -+int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, -+ unsigned, struct bkey_i *); -+int bch2_journal_key_insert(struct bch_fs *, enum btree_id, -+ unsigned, struct bkey_i *); -+int bch2_journal_key_delete(struct bch_fs *, enum btree_id, -+ unsigned, struct bpos); -+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, -+ unsigned, struct bpos); -+ -+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); -+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); -+ -+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -+void 
__bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, -+ struct bch_fs *, struct btree *, -+ struct btree_node_iter, struct bpos); -+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, -+ struct bch_fs *, -+ struct btree *); -+ -+void bch2_journal_keys_free(struct journal_keys *); -+void bch2_journal_entries_free(struct bch_fs *); ++ if (c->curr_recovery_pass >= pass) { ++ c->curr_recovery_pass = pass; ++ c->recovery_passes_complete &= (1ULL << pass) >> 1; ++ return -BCH_ERR_restart_recovery; ++ } else { ++ return 0; ++ } ++} + +u64 bch2_fsck_recovery_passes(void); + @@ -78421,6 +78702,60 @@ index 000000000..f8e796c0f +int bch2_fs_initialize(struct bch_fs *); + +#endif /* _BCACHEFS_RECOVERY_H */ +diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h +new file mode 100644 +index 000000000..abf1f834e +--- /dev/null ++++ b/fs/bcachefs/recovery_types.h +@@ -0,0 +1,48 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_RECOVERY_TYPES_H ++#define _BCACHEFS_RECOVERY_TYPES_H ++ ++#define PASS_SILENT BIT(0) ++#define PASS_FSCK BIT(1) ++#define PASS_UNCLEAN BIT(2) ++#define PASS_ALWAYS BIT(3) ++ ++#define BCH_RECOVERY_PASSES() \ ++ x(alloc_read, PASS_ALWAYS) \ ++ x(stripes_read, PASS_ALWAYS) \ ++ x(initialize_subvolumes, 0) \ ++ x(snapshots_read, PASS_ALWAYS) \ ++ x(check_topology, 0) \ ++ x(check_allocations, PASS_FSCK) \ ++ x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ ++ x(journal_replay, PASS_ALWAYS) \ ++ x(check_alloc_info, PASS_FSCK) \ ++ x(check_lrus, PASS_FSCK) \ ++ x(check_btree_backpointers, PASS_FSCK) \ ++ x(check_backpointers_to_extents,PASS_FSCK) \ ++ x(check_extents_to_backpointers,PASS_FSCK) \ ++ x(check_alloc_to_lru_refs, PASS_FSCK) \ ++ x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ ++ x(bucket_gens_init, 0) \ ++ x(check_snapshot_trees, PASS_FSCK) \ ++ x(check_snapshots, PASS_FSCK) \ ++ x(check_subvols, PASS_FSCK) \ ++ x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ ++ x(fs_upgrade_for_subvolumes, 0) \ ++ x(check_inodes, PASS_FSCK) \ ++ x(check_extents, PASS_FSCK) \ ++ x(check_dirents, PASS_FSCK) \ ++ x(check_xattrs, PASS_FSCK) \ ++ x(check_root, PASS_FSCK) \ ++ x(check_directory_structure, PASS_FSCK) \ ++ x(check_nlinks, PASS_FSCK) \ ++ x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \ ++ x(fix_reflink_p, 0) \ ++ ++enum bch_recovery_pass { ++#define x(n, when) BCH_RECOVERY_PASS_##n, ++ BCH_RECOVERY_PASSES() ++#undef x ++}; ++ ++#endif /* _BCACHEFS_RECOVERY_TYPES_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c new file mode 100644 index 000000000..39f711d50 @@ -80108,6 +80443,790 @@ index 000000000..5cfff489b +}; + +#endif /* _BCACHEFS_REPLICAS_TYPES_H */ +diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c +new file mode 100644 +index 000000000..a3695e56a +--- /dev/null ++++ b/fs/bcachefs/sb-clean.c +@@ -0,0 +1,395 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "error.h" ++#include "journal_io.h" ++#include "replicas.h" ++#include "sb-clean.h" ++#include "super-io.h" ++ ++/* ++ * BCH_SB_FIELD_clean: ++ * ++ * Btree roots, and a few other things, are recovered from the journal after an ++ * unclean shutdown - but after a clean shutdown, to avoid having to read the ++ * journal, we can store them in the superblock. 
++ * ++ * bch_sb_field_clean simply contains a list of journal entries, stored exactly ++ * as they would be in the journal: ++ */ ++ ++int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, ++ int write) ++{ ++ struct jset_entry *entry; ++ int ret; ++ ++ for (entry = clean->start; ++ entry < (struct jset_entry *) vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ ret = bch2_journal_entry_validate(c, NULL, entry, ++ le16_to_cpu(c->disk_sb.sb->version), ++ BCH_SB_BIG_ENDIAN(c->disk_sb.sb), ++ write); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static struct bkey_i *btree_root_find(struct bch_fs *c, ++ struct bch_sb_field_clean *clean, ++ struct jset *j, ++ enum btree_id id, unsigned *level) ++{ ++ struct bkey_i *k; ++ struct jset_entry *entry, *start, *end; ++ ++ if (clean) { ++ start = clean->start; ++ end = vstruct_end(&clean->field); ++ } else { ++ start = j->start; ++ end = vstruct_last(j); ++ } ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root && ++ entry->btree_id == id) ++ goto found; ++ ++ return NULL; ++found: ++ if (!entry->u64s) ++ return ERR_PTR(-EINVAL); ++ ++ k = entry->start; ++ *level = entry->level; ++ return k; ++} ++ ++int bch2_verify_superblock_clean(struct bch_fs *c, ++ struct bch_sb_field_clean **cleanp, ++ struct jset *j) ++{ ++ unsigned i; ++ struct bch_sb_field_clean *clean = *cleanp; ++ struct printbuf buf1 = PRINTBUF; ++ struct printbuf buf2 = PRINTBUF; ++ int ret = 0; ++ ++ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, ++ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", ++ le64_to_cpu(clean->journal_seq), ++ le64_to_cpu(j->seq))) { ++ kfree(clean); ++ *cleanp = NULL; ++ return 0; ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct bkey_i *k1, *k2; ++ unsigned l1 = 0, l2 = 0; ++ ++ k1 = btree_root_find(c, clean, NULL, i, &l1); ++ k2 = btree_root_find(c, NULL, j, i, &l2); ++ ++ if (!k1 && !k2) ++ continue; ++ ++ printbuf_reset(&buf1); ++ printbuf_reset(&buf2); ++ ++ if (k1) ++ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); ++ else ++ prt_printf(&buf1, "(none)"); ++ ++ if (k2) ++ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); ++ else ++ prt_printf(&buf2, "(none)"); ++ ++ mustfix_fsck_err_on(!k1 || !k2 || ++ IS_ERR(k1) || ++ IS_ERR(k2) || ++ k1->k.u64s != k2->k.u64s || ++ memcmp(k1, k2, bkey_bytes(&k1->k)) || ++ l1 != l2, c, ++ "superblock btree root %u doesn't match journal after clean shutdown\n" ++ "sb: l=%u %s\n" ++ "journal: l=%u %s\n", i, ++ l1, buf1.buf, ++ l2, buf2.buf); ++ } ++fsck_err: ++ printbuf_exit(&buf2); ++ printbuf_exit(&buf1); ++ return ret; ++} ++ ++struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *clean, *sb_clean; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); ++ ++ if (fsck_err_on(!sb_clean, c, ++ "superblock marked clean but clean section not present")) { ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ mutex_unlock(&c->sb_lock); ++ return NULL; ++ } ++ ++ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), ++ GFP_KERNEL); ++ if (!clean) { ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); ++ } ++ ++ ret = bch2_sb_clean_validate_late(c, clean, READ); ++ if (ret) { ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(ret); ++ } ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return clean; ++fsck_err: ++ 
mutex_unlock(&c->sb_lock); ++ return ERR_PTR(ret); ++} ++ ++static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) ++{ ++ struct jset_entry *entry = *end; ++ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); ++ ++ memset(entry, 0, u64s * sizeof(u64)); ++ /* ++ * The u64s field counts from the start of data, ignoring the shared ++ * fields. ++ */ ++ entry->u64s = cpu_to_le16(u64s - 1); ++ ++ *end = vstruct_next(*end); ++ return entry; ++} ++ ++void bch2_journal_super_entries_add_common(struct bch_fs *c, ++ struct jset_entry **end, ++ u64 journal_seq) ++{ ++ struct bch_dev *ca; ++ unsigned i, dev; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ if (!journal_seq) { ++ for (i = 0; i < ARRAY_SIZE(c->usage); i++) ++ bch2_fs_usage_acc_to_base(c, i); ++ } else { ++ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = BCH_FS_USAGE_inodes; ++ u->v = cpu_to_le64(c->usage_base->nr_inodes); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = BCH_FS_USAGE_key_version; ++ u->v = cpu_to_le64(atomic64_read(&c->key_version)); ++ } ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ struct jset_entry_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u)), ++ struct jset_entry_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = BCH_FS_USAGE_reserved; ++ u->entry.level = i; ++ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct jset_entry_data_usage *u = ++ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), ++ struct jset_entry_data_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_data_usage; ++ u->v = cpu_to_le64(c->usage_base->replicas[i]); ++ unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), ++ "embedded variable length struct"); ++ } ++ ++ for_each_member_device(ca, c, dev) { ++ unsigned b = sizeof(struct jset_entry_dev_usage) + ++ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; ++ struct jset_entry_dev_usage *u = ++ container_of(jset_entry_init(end, b), ++ struct jset_entry_dev_usage, entry); ++ ++ u->entry.type = BCH_JSET_ENTRY_dev_usage; ++ u->dev = cpu_to_le32(dev); ++ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); ++ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); ++ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); ++ } ++ } ++ ++ percpu_up_read(&c->mark_lock); ++ ++ for (i = 0; i < 2; i++) { ++ struct jset_entry_clock *clock = ++ container_of(jset_entry_init(end, sizeof(*clock)), ++ struct jset_entry_clock, entry); ++ ++ clock->entry.type = BCH_JSET_ENTRY_clock; ++ clock->rw = i; ++ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); ++ } ++} ++ ++static int bch2_sb_clean_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ ++ if (vstruct_bytes(&clean->field) < sizeof(*clean)) { ++ prt_printf(err, "wrong size (got %zu should be %zu)", ++ vstruct_bytes(&clean->field), sizeof(*clean)); ++ return 
-BCH_ERR_invalid_sb_clean; ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ struct jset_entry *entry; ++ ++ prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); ++ prt_newline(out); ++ prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); ++ prt_newline(out); ++ ++ for (entry = clean->start; ++ entry != vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ if (entry->type == BCH_JSET_ENTRY_btree_keys && ++ !entry->u64s) ++ continue; ++ ++ bch2_journal_entry_to_text(out, NULL, entry); ++ prt_newline(out); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_clean = { ++ .validate = bch2_sb_clean_validate, ++ .to_text = bch2_sb_clean_to_text, ++}; ++ ++int bch2_fs_mark_dirty(struct bch_fs *c) ++{ ++ int ret; ++ ++ /* ++ * Unconditionally write superblock, to verify it hasn't changed before ++ * we go rw: ++ */ ++ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ ++ bch2_sb_maybe_downgrade(c); ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); ++ ++ ret = bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++void bch2_fs_mark_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *sb_clean; ++ struct jset_entry *entry; ++ unsigned u64s; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ if (BCH_SB_CLEAN(c->disk_sb.sb)) ++ goto out; ++ ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); ++ ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); ++ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); ++ ++ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; ++ ++ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); ++ if (!sb_clean) { ++ bch_err(c, "error resizing superblock while setting filesystem clean"); ++ goto out; ++ } ++ ++ sb_clean->flags = 0; ++ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); ++ ++ /* Trying to catch outstanding bug: */ ++ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); ++ ++ entry = sb_clean->start; ++ bch2_journal_super_entries_add_common(c, &entry, 0); ++ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); ++ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); ++ ++ memset(entry, 0, ++ vstruct_end(&sb_clean->field) - (void *) entry); ++ ++ /* ++ * this should be in the write path, and we should be validating every ++ * superblock section: ++ */ ++ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); ++ if (ret) { ++ bch_err(c, "error writing marking filesystem clean: validate error"); ++ goto out; ++ } ++ ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++} +diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h +new file mode 100644 +index 000000000..71caef281 +--- /dev/null ++++ b/fs/bcachefs/sb-clean.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SB_CLEAN_H ++#define _BCACHEFS_SB_CLEAN_H ++ ++int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); ++int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **, ++ struct jset *); ++struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *); ++void 
bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_clean; ++ ++int bch2_fs_mark_dirty(struct bch_fs *); ++void bch2_fs_mark_clean(struct bch_fs *); ++ ++#endif /* _BCACHEFS_SB_CLEAN_H */ +diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c +new file mode 100644 +index 000000000..16a2b3389 +--- /dev/null ++++ b/fs/bcachefs/sb-members.c +@@ -0,0 +1,173 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "disk_groups.h" ++#include "replicas.h" ++#include "sb-members.h" ++#include "super-io.h" ++ ++/* Code for bch_sb_field_members: */ ++ ++static int bch2_sb_members_validate(struct bch_sb *sb, ++ struct bch_sb_field *f, ++ struct printbuf *err) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ unsigned i; ++ ++ if ((void *) (mi->members + sb->nr_devices) > ++ vstruct_end(&mi->field)) { ++ prt_printf(err, "too many devices for section size"); ++ return -BCH_ERR_invalid_sb_members; ++ } ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ if (le64_to_cpu(m->nbuckets) > LONG_MAX) { ++ prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", ++ i, le64_to_cpu(m->nbuckets), LONG_MAX); ++ return -BCH_ERR_invalid_sb_members; ++ } ++ ++ if (le64_to_cpu(m->nbuckets) - ++ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { ++ prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", ++ i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); ++ return -BCH_ERR_invalid_sb_members; ++ } ++ ++ if (le16_to_cpu(m->bucket_size) < ++ le16_to_cpu(sb->block_size)) { ++ prt_printf(err, "device %u: bucket size %u smaller than block size %u", ++ i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); ++ return -BCH_ERR_invalid_sb_members; ++ } ++ ++ if (le16_to_cpu(m->bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(sb)) { ++ prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", ++ i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); ++ return -BCH_ERR_invalid_sb_members; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); ++ unsigned i; ++ ++ for (i = 0; i < sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ unsigned data_have = bch2_sb_dev_has_data(sb, i); ++ u64 bucket_size = le16_to_cpu(m->bucket_size); ++ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ prt_printf(out, "Device:"); ++ prt_tab(out); ++ prt_printf(out, "%u", i); ++ prt_newline(out); ++ ++ printbuf_indent_add(out, 2); ++ ++ prt_printf(out, "UUID:"); ++ prt_tab(out); ++ pr_uuid(out, m->uuid.b); ++ prt_newline(out); ++ ++ prt_printf(out, "Size:"); ++ prt_tab(out); ++ prt_units_u64(out, device_size << 9); ++ prt_newline(out); ++ ++ prt_printf(out, "Bucket size:"); ++ prt_tab(out); ++ prt_units_u64(out, bucket_size << 9); ++ prt_newline(out); ++ ++ prt_printf(out, "First bucket:"); ++ prt_tab(out); ++ prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); ++ prt_newline(out); ++ ++ prt_printf(out, "Buckets:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); ++ prt_newline(out); ++ ++ prt_printf(out, "Last mount:"); ++ prt_tab(out); ++ if 
(m->last_mount) ++ pr_time(out, le64_to_cpu(m->last_mount)); ++ else ++ prt_printf(out, "(never)"); ++ prt_newline(out); ++ ++ prt_printf(out, "State:"); ++ prt_tab(out); ++ prt_printf(out, "%s", ++ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR ++ ? bch2_member_states[BCH_MEMBER_STATE(m)] ++ : "unknown"); ++ prt_newline(out); ++ ++ prt_printf(out, "Label:"); ++ prt_tab(out); ++ if (BCH_MEMBER_GROUP(m)) { ++ unsigned idx = BCH_MEMBER_GROUP(m) - 1; ++ ++ if (idx < disk_groups_nr(gi)) ++ prt_printf(out, "%s (%u)", ++ gi->entries[idx].label, idx); ++ else ++ prt_printf(out, "(bad disk labels section)"); ++ } else { ++ prt_printf(out, "(none)"); ++ } ++ prt_newline(out); ++ ++ prt_printf(out, "Data allowed:"); ++ prt_tab(out); ++ if (BCH_MEMBER_DATA_ALLOWED(m)) ++ prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); ++ else ++ prt_printf(out, "(none)"); ++ prt_newline(out); ++ ++ prt_printf(out, "Has data:"); ++ prt_tab(out); ++ if (data_have) ++ prt_bitflags(out, bch2_data_types, data_have); ++ else ++ prt_printf(out, "(none)"); ++ prt_newline(out); ++ ++ prt_printf(out, "Discard:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); ++ prt_newline(out); ++ ++ prt_printf(out, "Freespace initialized:"); ++ prt_tab(out); ++ prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); ++ prt_newline(out); ++ ++ printbuf_indent_sub(out, 2); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_members = { ++ .validate = bch2_sb_members_validate, ++ .to_text = bch2_sb_members_to_text, ++}; +diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h +new file mode 100644 +index 000000000..34e1cf604 +--- /dev/null ++++ b/fs/bcachefs/sb-members.h +@@ -0,0 +1,176 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SB_MEMBERS_H ++#define _BCACHEFS_SB_MEMBERS_H ++ ++static inline bool bch2_dev_is_online(struct bch_dev *ca) ++{ ++ return !percpu_ref_is_zero(&ca->io_ref); ++} ++ ++static inline bool bch2_dev_is_readable(struct bch_dev *ca) ++{ ++ return bch2_dev_is_online(ca) && ++ ca->mi.state != BCH_MEMBER_STATE_failed; ++} ++ ++static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) ++{ ++ if (!percpu_ref_tryget(&ca->io_ref)) ++ return false; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_rw || ++ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) ++ return true; ++ ++ percpu_ref_put(&ca->io_ref); ++ return false; ++} ++ ++static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) ++{ ++ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); ++} ++ ++static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs.nr; i++) ++ if (devs.devs[i] == dev) ++ return true; ++ ++ return false; ++} ++ ++static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs->nr; i++) ++ if (devs->devs[i] == dev) { ++ array_remove_item(devs->devs, devs->nr, i); ++ return; ++ } ++} ++ ++static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ if (!bch2_dev_list_has_dev(*devs, dev)) { ++ BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); ++ devs->devs[devs->nr++] = dev; ++ } ++} ++ ++static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) ++{ ++ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; ++} ++ ++static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, ++ const struct bch_devs_mask *mask) ++{ ++ struct bch_dev *ca = NULL; ++ ++ while ((*iter = 
mask ++ ? find_next_bit(mask->d, c->sb.nr_devices, *iter) ++ : *iter) < c->sb.nr_devices && ++ !(ca = rcu_dereference_check(c->devs[*iter], ++ lockdep_is_held(&c->state_lock)))) ++ (*iter)++; ++ ++ return ca; ++} ++ ++#define for_each_member_device_rcu(ca, c, iter, mask) \ ++ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) ++ ++static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ if ((ca = __bch2_next_dev(c, iter, NULL))) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++/* ++ * If you break early, you must drop your ref on the current device ++ */ ++#define for_each_member_device(ca, c, iter) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_dev(c, &(iter))); \ ++ percpu_ref_put(&ca->ref), (iter)++) ++ ++static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, ++ unsigned *iter, ++ int state_mask) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ while ((ca = __bch2_next_dev(c, iter, NULL)) && ++ (!((1 << ca->mi.state) & state_mask) || ++ !percpu_ref_tryget(&ca->io_ref))) ++ (*iter)++; ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++#define __for_each_online_member(ca, c, iter, state_mask) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ ++ percpu_ref_put(&ca->io_ref), (iter)++) ++ ++#define for_each_online_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, ~0) ++ ++#define for_each_rw_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) ++ ++#define for_each_readable_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, \ ++ (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) ++ ++/* ++ * If a key exists that references a device, the device won't be going away and ++ * we can omit rcu_read_lock(): ++ */ ++static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_check(c->devs[idx], 1); ++} ++ ++static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_protected(c->devs[idx], ++ lockdep_is_held(&c->sb_lock) || ++ lockdep_is_held(&c->state_lock)); ++} ++ ++/* XXX kill, move to struct bch_fs */ ++static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) ++{ ++ struct bch_devs_mask devs; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ memset(&devs, 0, sizeof(devs)); ++ for_each_online_member(ca, c, i) ++ __set_bit(ca->dev_idx, devs.d); ++ return devs; ++} ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_members; ++ ++#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h new file mode 100644 index 000000000..c1860d816 @@ -80434,6 +81553,3295 @@ index 000000000..3dfaf34a4 +#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) + +#endif /* _SIPHASH_H_ */ +diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c +new file mode 100644 +index 000000000..14cffa68d +--- /dev/null ++++ b/fs/bcachefs/six.c +@@ -0,0 +1,918 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "six.h" ++ ++#ifdef DEBUG ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) do {} while (0) ++#endif ++ ++#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, 
r, 1, NULL, ip) ++#define six_release(l, ip) lock_release(l, ip) ++ ++static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); ++ ++#define SIX_LOCK_HELD_read_OFFSET 0 ++#define SIX_LOCK_HELD_read ~(~0U << 26) ++#define SIX_LOCK_HELD_intent (1U << 26) ++#define SIX_LOCK_HELD_write (1U << 27) ++#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) ++#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent)) ++#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) ++#define SIX_LOCK_NOSPIN (1U << 31) ++ ++struct six_lock_vals { ++ /* Value we add to the lock in order to take the lock: */ ++ u32 lock_val; ++ ++ /* If the lock has this value (used as a mask), taking the lock fails: */ ++ u32 lock_fail; ++ ++ /* Mask that indicates lock is held for this type: */ ++ u32 held_mask; ++ ++ /* Waitlist we wakeup when releasing the lock: */ ++ enum six_lock_type unlock_wakeup; ++}; ++ ++static const struct six_lock_vals l[] = { ++ [SIX_LOCK_read] = { ++ .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, ++ .lock_fail = SIX_LOCK_HELD_write, ++ .held_mask = SIX_LOCK_HELD_read, ++ .unlock_wakeup = SIX_LOCK_write, ++ }, ++ [SIX_LOCK_intent] = { ++ .lock_val = SIX_LOCK_HELD_intent, ++ .lock_fail = SIX_LOCK_HELD_intent, ++ .held_mask = SIX_LOCK_HELD_intent, ++ .unlock_wakeup = SIX_LOCK_intent, ++ }, ++ [SIX_LOCK_write] = { ++ .lock_val = SIX_LOCK_HELD_write, ++ .lock_fail = SIX_LOCK_HELD_read, ++ .held_mask = SIX_LOCK_HELD_write, ++ .unlock_wakeup = SIX_LOCK_read, ++ }, ++}; ++ ++static inline void six_set_bitmask(struct six_lock *lock, u32 mask) ++{ ++ if ((atomic_read(&lock->state) & mask) != mask) ++ atomic_or(mask, &lock->state); ++} ++ ++static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) ++{ ++ if (atomic_read(&lock->state) & mask) ++ atomic_and(~mask, &lock->state); ++} ++ ++static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, ++ u32 old, struct task_struct *owner) ++{ ++ if (type != SIX_LOCK_intent) ++ return; ++ ++ if (!(old & SIX_LOCK_HELD_intent)) { ++ EBUG_ON(lock->owner); ++ lock->owner = owner; ++ } else { ++ EBUG_ON(lock->owner != current); ++ } ++} ++ ++static inline unsigned pcpu_read_count(struct six_lock *lock) ++{ ++ unsigned read_count = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ read_count += *per_cpu_ptr(lock->readers, cpu); ++ return read_count; ++} ++ ++/* ++ * __do_six_trylock() - main trylock routine ++ * ++ * Returns 1 on success, 0 on failure ++ * ++ * In percpu reader mode, a failed trylock may cause a spurious trylock failure ++ * for anoter thread taking the competing lock type, and we may havve to do a ++ * wakeup: when a wakeup is required, we return -1 - wakeup_type. ++ */ ++static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, ++ struct task_struct *task, bool try) ++{ ++ int ret; ++ u32 old; ++ ++ EBUG_ON(type == SIX_LOCK_write && lock->owner != task); ++ EBUG_ON(type == SIX_LOCK_write && ++ (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); ++ ++ /* ++ * Percpu reader mode: ++ * ++ * The basic idea behind this algorithm is that you can implement a lock ++ * between two threads without any atomics, just memory barriers: ++ * ++ * For two threads you'll need two variables, one variable for "thread a ++ * has the lock" and another for "thread b has the lock". 
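
The #defines above pack the whole lock state into one 32-bit word: bits 0..25 hold a reader count, bit 26 the intent holder, bit 27 the writer, bits 28..30 mark which wait lists are populated, and bit 31 is the no-spin hint. A small stand-alone check of that layout (mask values copied from the definitions above, the rest is illustrative):

#include <assert.h>
#include <stdint.h>

#define HELD_READ_MASK	(~(~0u << 26))	/* bits 0..25: reader count */
#define HELD_INTENT	(1u << 26)
#define HELD_WRITE	(1u << 27)

int main(void)
{
	uint32_t state = 0;

	state += 3;			/* three read holds add directly to the low bits */
	state |= HELD_INTENT;		/* one intent holder, compatible with readers */

	assert((state & HELD_READ_MASK) == 3);
	assert(state & HELD_INTENT);
	assert(!(state & HELD_WRITE));	/* write is excluded while reads are held */
	return 0;
}

This is why l[SIX_LOCK_read].lock_val is simply 1: taking a read lock is an atomic add of one to the state word, and its lock_fail mask only has to test SIX_LOCK_HELD_write.
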
++ * ++ * To take the lock, a thread sets its variable indicating that it holds ++ * the lock, then issues a full memory barrier, then reads from the ++ * other thread's variable to check if the other thread thinks it has ++ * the lock. If we raced, we backoff and retry/sleep. ++ * ++ * Failure to take the lock may cause a spurious trylock failure in ++ * another thread, because we temporarily set the lock to indicate that ++ * we held it. This would be a problem for a thread in six_lock(), when ++ * they are calling trylock after adding themself to the waitlist and ++ * prior to sleeping. ++ * ++ * Therefore, if we fail to get the lock, and there were waiters of the ++ * type we conflict with, we will have to issue a wakeup. ++ * ++ * Since we may be called under wait_lock (and by the wakeup code ++ * itself), we return that the wakeup has to be done instead of doing it ++ * here. ++ */ ++ if (type == SIX_LOCK_read && lock->readers) { ++ preempt_disable(); ++ this_cpu_inc(*lock->readers); /* signal that we own lock */ ++ ++ smp_mb(); ++ ++ old = atomic_read(&lock->state); ++ ret = !(old & l[type].lock_fail); ++ ++ this_cpu_sub(*lock->readers, !ret); ++ preempt_enable(); ++ ++ if (!ret && (old & SIX_LOCK_WAITING_write)) ++ ret = -1 - SIX_LOCK_write; ++ } else if (type == SIX_LOCK_write && lock->readers) { ++ if (try) { ++ atomic_add(SIX_LOCK_HELD_write, &lock->state); ++ smp_mb__after_atomic(); ++ } ++ ++ ret = !pcpu_read_count(lock); ++ ++ if (try && !ret) { ++ old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); ++ if (old & SIX_LOCK_WAITING_read) ++ ret = -1 - SIX_LOCK_read; ++ } ++ } else { ++ old = atomic_read(&lock->state); ++ do { ++ ret = !(old & l[type].lock_fail); ++ if (!ret || (type == SIX_LOCK_write && !try)) { ++ smp_mb(); ++ break; ++ } ++ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); ++ ++ EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); ++ } ++ ++ if (ret > 0) ++ six_set_owner(lock, type, old, task); ++ ++ EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && ++ (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); ++ ++ return ret; ++} ++ ++static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) ++{ ++ struct six_lock_waiter *w, *next; ++ struct task_struct *task; ++ bool saw_one; ++ int ret; ++again: ++ ret = 0; ++ saw_one = false; ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry_safe(w, next, &lock->wait_list, list) { ++ if (w->lock_want != lock_type) ++ continue; ++ ++ if (saw_one && lock_type != SIX_LOCK_read) ++ goto unlock; ++ saw_one = true; ++ ++ ret = __do_six_trylock(lock, lock_type, w->task, false); ++ if (ret <= 0) ++ goto unlock; ++ ++ /* ++ * Similar to percpu_rwsem_wake_function(), we need to guard ++ * against the wakee noticing w->lock_acquired, returning, and ++ * then exiting before we do the wakeup: ++ */ ++ task = get_task_struct(w->task); ++ __list_del(w->list.prev, w->list.next); ++ /* ++ * The release barrier here ensures the ordering of the ++ * __list_del before setting w->lock_acquired; @w is on the ++ * stack of the thread doing the waiting and will be reused ++ * after it sees w->lock_acquired with no other locking: ++ * pairs with smp_load_acquire() in six_lock_slowpath() ++ */ ++ smp_store_release(&w->lock_acquired, true); ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++ ++ six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); ++unlock: ++ raw_spin_unlock(&lock->wait_lock); ++ ++ if (ret < 0) { ++ lock_type = -ret - 1; ++ goto 
again; ++ } ++} ++ ++__always_inline ++static void six_lock_wakeup(struct six_lock *lock, u32 state, ++ enum six_lock_type lock_type) ++{ ++ if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) ++ return; ++ ++ if (!(state & (SIX_LOCK_WAITING_read << lock_type))) ++ return; ++ ++ __six_lock_wakeup(lock, lock_type); ++} ++ ++__always_inline ++static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) ++{ ++ int ret; ++ ++ ret = __do_six_trylock(lock, type, current, try); ++ if (ret < 0) ++ __six_lock_wakeup(lock, -ret - 1); ++ ++ return ret > 0; ++} ++ ++/** ++ * six_trylock_ip - attempt to take a six lock without blocking ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ ++ * ++ * Return: true on success, false on failure. ++ */ ++bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) ++{ ++ if (!do_six_trylock(lock, type, true)) ++ return false; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); ++ return true; ++} ++EXPORT_SYMBOL_GPL(six_trylock_ip); ++ ++/** ++ * six_relock_ip - attempt to re-take a lock that was held previously ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @seq: lock sequence number obtained from six_lock_seq() while lock was ++ * held previously ++ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ ++ * ++ * Return: true on success, false on failure. ++ */ ++bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq, unsigned long ip) ++{ ++ if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) ++ return false; ++ ++ if (six_lock_seq(lock) != seq) { ++ six_unlock_ip(lock, type, ip); ++ return false; ++ } ++ ++ return true; ++} ++EXPORT_SYMBOL_GPL(six_relock_ip); ++ ++#ifdef CONFIG_LOCK_SPIN_ON_OWNER ++ ++static inline bool six_can_spin_on_owner(struct six_lock *lock) ++{ ++ struct task_struct *owner; ++ bool ret; ++ ++ if (need_resched()) ++ return false; ++ ++ rcu_read_lock(); ++ owner = READ_ONCE(lock->owner); ++ ret = !owner || owner_on_cpu(owner); ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline bool six_spin_on_owner(struct six_lock *lock, ++ struct task_struct *owner, ++ u64 end_time) ++{ ++ bool ret = true; ++ unsigned loop = 0; ++ ++ rcu_read_lock(); ++ while (lock->owner == owner) { ++ /* ++ * Ensure we emit the owner->on_cpu, dereference _after_ ++ * checking lock->owner still matches owner. If that fails, ++ * owner might point to freed memory. If it still matches, ++ * the rcu_read_lock() ensures the memory stays valid. ++ */ ++ barrier(); ++ ++ if (!owner_on_cpu(owner) || need_resched()) { ++ ret = false; ++ break; ++ } ++ ++ if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { ++ six_set_bitmask(lock, SIX_LOCK_NOSPIN); ++ ret = false; ++ break; ++ } ++ ++ cpu_relax(); ++ } ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ struct task_struct *task = current; ++ u64 end_time; ++ ++ if (type == SIX_LOCK_write) ++ return false; ++ ++ preempt_disable(); ++ if (!six_can_spin_on_owner(lock)) ++ goto fail; ++ ++ if (!osq_lock(&lock->osq)) ++ goto fail; ++ ++ end_time = sched_clock() + 10 * NSEC_PER_USEC; ++ ++ while (1) { ++ struct task_struct *owner; ++ ++ /* ++ * If there's an owner, wait for it to either ++ * release the lock or go to sleep. 
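
The percpu reader mode described at the top of __do_six_trylock() is the classic store-then-check protocol: each side publishes its claim, issues a full memory barrier, and only then looks at the other side's claim. A userspace sketch of that idea in C11 atomics (a single shared counter stands in for the percpu reader counters, and the names are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned reader_count;	/* stand-in for the this_cpu reader counters */
static _Atomic unsigned write_held;	/* stand-in for SIX_LOCK_HELD_write */

/* Reader: claim first, full fence, then check for a writer. */
static bool trylock_read(void)
{
	atomic_fetch_add_explicit(&reader_count, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);

	if (atomic_load_explicit(&write_held, memory_order_relaxed)) {
		/* Raced with a writer: back off, as the comment describes. */
		atomic_fetch_sub_explicit(&reader_count, 1, memory_order_relaxed);
		return false;
	}
	return true;
}

/* Writer: claim first, full fence, then check for readers. */
static bool trylock_write(void)
{
	atomic_store_explicit(&write_held, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);

	if (atomic_load_explicit(&reader_count, memory_order_relaxed)) {
		/* Raced with a reader: withdraw our claim and fail. */
		atomic_store_explicit(&write_held, 0, memory_order_relaxed);
		return false;
	}
	return true;
}

With the fences in place both sides can fail in a race, but they can never both succeed. The cost is exactly the spurious failure the comment warns about, which is why __do_six_trylock() returns -1 - lock_type on that path, so the caller wakes the waiters of the conflicting lock type it may have tripped up.
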
++ */ ++ owner = READ_ONCE(lock->owner); ++ if (owner && !six_spin_on_owner(lock, owner, end_time)) ++ break; ++ ++ if (do_six_trylock(lock, type, false)) { ++ osq_unlock(&lock->osq); ++ preempt_enable(); ++ return true; ++ } ++ ++ /* ++ * When there's no owner, we might have preempted between the ++ * owner acquiring the lock and setting the owner field. If ++ * we're an RT task that will live-lock because we won't let ++ * the owner complete. ++ */ ++ if (!owner && (need_resched() || rt_task(task))) ++ break; ++ ++ /* ++ * The cpu_relax() call is a compiler barrier which forces ++ * everything in this loop to be re-loaded. We don't need ++ * memory barriers as we'll eventually observe the right ++ * values at the cost of a few extra spins. ++ */ ++ cpu_relax(); ++ } ++ ++ osq_unlock(&lock->osq); ++fail: ++ preempt_enable(); ++ ++ /* ++ * If we fell out of the spin path because of need_resched(), ++ * reschedule now, before we try-lock again. This avoids getting ++ * scheduled out right after we obtained the lock. ++ */ ++ if (need_resched()) ++ schedule(); ++ ++ return false; ++} ++ ++#else /* CONFIG_LOCK_SPIN_ON_OWNER */ ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ return false; ++} ++ ++#endif ++ ++noinline ++static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, ++ struct six_lock_waiter *wait, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) ++{ ++ int ret = 0; ++ ++ if (type == SIX_LOCK_write) { ++ EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); ++ atomic_add(SIX_LOCK_HELD_write, &lock->state); ++ smp_mb__after_atomic(); ++ } ++ ++ trace_contention_begin(lock, 0); ++ lock_contended(&lock->dep_map, ip); ++ ++ if (six_optimistic_spin(lock, type)) ++ goto out; ++ ++ wait->task = current; ++ wait->lock_want = type; ++ wait->lock_acquired = false; ++ ++ raw_spin_lock(&lock->wait_lock); ++ six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); ++ /* ++ * Retry taking the lock after taking waitlist lock, in case we raced ++ * with an unlock: ++ */ ++ ret = __do_six_trylock(lock, type, current, false); ++ if (ret <= 0) { ++ wait->start_time = local_clock(); ++ ++ if (!list_empty(&lock->wait_list)) { ++ struct six_lock_waiter *last = ++ list_last_entry(&lock->wait_list, ++ struct six_lock_waiter, list); ++ ++ if (time_before_eq64(wait->start_time, last->start_time)) ++ wait->start_time = last->start_time + 1; ++ } ++ ++ list_add_tail(&wait->list, &lock->wait_list); ++ } ++ raw_spin_unlock(&lock->wait_lock); ++ ++ if (unlikely(ret > 0)) { ++ ret = 0; ++ goto out; ++ } ++ ++ if (unlikely(ret < 0)) { ++ __six_lock_wakeup(lock, -ret - 1); ++ ret = 0; ++ } ++ ++ while (1) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ ++ /* ++ * Ensures that writes to the waitlist entry happen after we see ++ * wait->lock_acquired: pairs with the smp_store_release in ++ * __six_lock_wakeup ++ */ ++ if (smp_load_acquire(&wait->lock_acquired)) ++ break; ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (unlikely(ret)) { ++ bool acquired; ++ ++ /* ++ * If should_sleep_fn() returns an error, we are ++ * required to return that error even if we already ++ * acquired the lock - should_sleep_fn() might have ++ * modified external state (e.g. 
when the deadlock cycle ++ * detector in bcachefs issued a transaction restart) ++ */ ++ raw_spin_lock(&lock->wait_lock); ++ acquired = wait->lock_acquired; ++ if (!acquired) ++ list_del(&wait->list); ++ raw_spin_unlock(&lock->wait_lock); ++ ++ if (unlikely(acquired)) ++ do_six_unlock_type(lock, type); ++ break; ++ } ++ ++ schedule(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++out: ++ if (ret && type == SIX_LOCK_write) { ++ six_clear_bitmask(lock, SIX_LOCK_HELD_write); ++ six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); ++ } ++ trace_contention_end(lock, 0); ++ ++ return ret; ++} ++ ++/** ++ * six_lock_ip_waiter - take a lock, with full waitlist interface ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @wait: pointer to wait object, which will be added to lock's waitlist ++ * @should_sleep_fn: callback run after adding to waitlist, immediately prior ++ * to scheduling ++ * @p: passed through to @should_sleep_fn ++ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ ++ * ++ * This is the most general six_lock() variant, with parameters to support full ++ * cycle detection for deadlock avoidance. ++ * ++ * The code calling this function must implement tracking of held locks, and the ++ * @wait object should be embedded into the struct that tracks held locks - ++ * which must also be accessible in a thread-safe way. ++ * ++ * @should_sleep_fn should invoke the cycle detector; it should walk each ++ * lock's waiters, and for each waiter recursively walk their held locks. ++ * ++ * When this function must block, @wait will be added to @lock's waitlist before ++ * calling trylock, and before calling @should_sleep_fn, and @wait will not be ++ * removed from the lock waitlist until the lock has been successfully acquired, ++ * or we abort. ++ * ++ * @wait.start_time will be monotonically increasing for any given waitlist, and ++ * thus may be used as a loop cursor. ++ * ++ * Return: 0 on success, or the return code from @should_sleep_fn on failure. ++ */ ++int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, ++ struct six_lock_waiter *wait, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) ++{ ++ int ret; ++ ++ wait->start_time = 0; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); ++ ++ ret = do_six_trylock(lock, type, true) ? 
0 ++ : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); ++ ++ if (ret && type != SIX_LOCK_write) ++ six_release(&lock->dep_map, ip); ++ if (!ret) ++ lock_acquired(&lock->dep_map, ip); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(six_lock_ip_waiter); ++ ++__always_inline ++static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ u32 state; ++ ++ if (type == SIX_LOCK_intent) ++ lock->owner = NULL; ++ ++ if (type == SIX_LOCK_read && ++ lock->readers) { ++ smp_mb(); /* unlock barrier */ ++ this_cpu_dec(*lock->readers); ++ smp_mb(); /* between unlocking and checking for waiters */ ++ state = atomic_read(&lock->state); ++ } else { ++ u32 v = l[type].lock_val; ++ ++ if (type != SIX_LOCK_read) ++ v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; ++ ++ EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); ++ state = atomic_sub_return_release(v, &lock->state); ++ } ++ ++ six_lock_wakeup(lock, state, l[type].unlock_wakeup); ++} ++ ++/** ++ * six_unlock_ip - drop a six lock ++ * @lock: lock to unlock ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ ++ * ++ * When a lock is held multiple times (because six_lock_incement()) was used), ++ * this decrements the 'lock held' counter by one. ++ * ++ * For example: ++ * six_lock_read(&foo->lock); read count 1 ++ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 ++ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 ++ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 ++ */ ++void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) ++{ ++ EBUG_ON(type == SIX_LOCK_write && ++ !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); ++ EBUG_ON((type == SIX_LOCK_write || ++ type == SIX_LOCK_intent) && ++ lock->owner != current); ++ ++ if (type != SIX_LOCK_write) ++ six_release(&lock->dep_map, ip); ++ else ++ lock->seq++; ++ ++ if (type == SIX_LOCK_intent && ++ lock->intent_lock_recurse) { ++ --lock->intent_lock_recurse; ++ return; ++ } ++ ++ do_six_unlock_type(lock, type); ++} ++EXPORT_SYMBOL_GPL(six_unlock_ip); ++ ++/** ++ * six_lock_downgrade - convert an intent lock to a read lock ++ * @lock: lock to dowgrade ++ * ++ * @lock will have read count incremented and intent count decremented ++ */ ++void six_lock_downgrade(struct six_lock *lock) ++{ ++ six_lock_increment(lock, SIX_LOCK_read); ++ six_unlock_intent(lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_downgrade); ++ ++/** ++ * six_lock_tryupgrade - attempt to convert read lock to an intent lock ++ * @lock: lock to upgrade ++ * ++ * On success, @lock will have intent count incremented and read count ++ * decremented ++ * ++ * Return: true on success, false on failure ++ */ ++bool six_lock_tryupgrade(struct six_lock *lock) ++{ ++ u32 old = atomic_read(&lock->state), new; ++ ++ do { ++ new = old; ++ ++ if (new & SIX_LOCK_HELD_intent) ++ return false; ++ ++ if (!lock->readers) { ++ EBUG_ON(!(new & SIX_LOCK_HELD_read)); ++ new -= l[SIX_LOCK_read].lock_val; ++ } ++ ++ new |= SIX_LOCK_HELD_intent; ++ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); ++ ++ if (lock->readers) ++ this_cpu_dec(*lock->readers); ++ ++ six_set_owner(lock, SIX_LOCK_intent, old, current); ++ ++ return true; ++} ++EXPORT_SYMBOL_GPL(six_lock_tryupgrade); ++ ++/** ++ * six_trylock_convert - attempt to convert a held lock from one type to another ++ * @lock: lock to upgrade ++ * @from: SIX_LOCK_read or SIX_LOCK_intent ++ * @to: SIX_LOCK_read or SIX_LOCK_intent ++ * ++ * 
On success, @lock will have intent count incremented and read count ++ * decremented ++ * ++ * Return: true on success, false on failure ++ */ ++bool six_trylock_convert(struct six_lock *lock, ++ enum six_lock_type from, ++ enum six_lock_type to) ++{ ++ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); ++ ++ if (to == from) ++ return true; ++ ++ if (to == SIX_LOCK_read) { ++ six_lock_downgrade(lock); ++ return true; ++ } else { ++ return six_lock_tryupgrade(lock); ++ } ++} ++EXPORT_SYMBOL_GPL(six_trylock_convert); ++ ++/** ++ * six_lock_increment - increase held lock count on a lock that is already held ++ * @lock: lock to increment ++ * @type: SIX_LOCK_read or SIX_LOCK_intent ++ * ++ * @lock must already be held, with a lock type that is greater than or equal to ++ * @type ++ * ++ * A corresponding six_unlock_type() call will be required for @lock to be fully ++ * unlocked. ++ */ ++void six_lock_increment(struct six_lock *lock, enum six_lock_type type) ++{ ++ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); ++ ++ /* XXX: assert already locked, and that we don't overflow: */ ++ ++ switch (type) { ++ case SIX_LOCK_read: ++ if (lock->readers) { ++ this_cpu_inc(*lock->readers); ++ } else { ++ EBUG_ON(!(atomic_read(&lock->state) & ++ (SIX_LOCK_HELD_read| ++ SIX_LOCK_HELD_intent))); ++ atomic_add(l[type].lock_val, &lock->state); ++ } ++ break; ++ case SIX_LOCK_intent: ++ EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); ++ lock->intent_lock_recurse++; ++ break; ++ case SIX_LOCK_write: ++ BUG(); ++ break; ++ } ++} ++EXPORT_SYMBOL_GPL(six_lock_increment); ++ ++/** ++ * six_lock_wakeup_all - wake up all waiters on @lock ++ * @lock: lock to wake up waiters for ++ * ++ * Wakeing up waiters will cause them to re-run should_sleep_fn, which may then ++ * abort the lock operation. ++ * ++ * This function is never needed in a bug-free program; it's only useful in ++ * debug code, e.g. to determine if a cycle detector is at fault. ++ */ ++void six_lock_wakeup_all(struct six_lock *lock) ++{ ++ u32 state = atomic_read(&lock->state); ++ struct six_lock_waiter *w; ++ ++ six_lock_wakeup(lock, state, SIX_LOCK_read); ++ six_lock_wakeup(lock, state, SIX_LOCK_intent); ++ six_lock_wakeup(lock, state, SIX_LOCK_write); ++ ++ raw_spin_lock(&lock->wait_lock); ++ list_for_each_entry(w, &lock->wait_list, list) ++ wake_up_process(w->task); ++ raw_spin_unlock(&lock->wait_lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_wakeup_all); ++ ++/** ++ * six_lock_counts - return held lock counts, for each lock type ++ * @lock: lock to return counters for ++ * ++ * Return: the number of times a lock is held for read, intent and write. ++ */ ++struct six_lock_count six_lock_counts(struct six_lock *lock) ++{ ++ struct six_lock_count ret; ++ ++ ret.n[SIX_LOCK_read] = !lock->readers ++ ? atomic_read(&lock->state) & SIX_LOCK_HELD_read ++ : pcpu_read_count(lock); ++ ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) + ++ lock->intent_lock_recurse; ++ ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(six_lock_counts); ++ ++/** ++ * six_lock_readers_add - directly manipulate reader count of a lock ++ * @lock: lock to add/subtract readers for ++ * @nr: reader count to add/subtract ++ * ++ * When an upper layer is implementing lock reentrency, we may have both read ++ * and intent locks on the same lock. 
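
six_lock_tryupgrade() above follows the standard try-cmpxchg loop: snapshot the state word, compute the desired new value, and retry from the refreshed snapshot if another CPU changed the word first. A compact sketch of that loop shape with C11 atomics (the field layout is the simplified one from the earlier sketch, and the caller is assumed to hold at least one read lock):

#include <stdatomic.h>
#include <stdbool.h>

#define HELD_READ_MASK	(~(~0u << 26))	/* bits 0..25: reader count (illustrative) */
#define HELD_INTENT	(1u << 26)

/* Try to convert one of our read holds into the intent lock, atomically. */
static bool tryupgrade(_Atomic unsigned *state)
{
	unsigned old = atomic_load_explicit(state, memory_order_relaxed);
	unsigned new;

	do {
		if (old & HELD_INTENT)			/* somebody else holds intent */
			return false;

		new = (old - 1) | HELD_INTENT;		/* drop our read, take intent */
	} while (!atomic_compare_exchange_weak_explicit(state, &old, new,
							memory_order_acquire,
							memory_order_relaxed));
	return true;
}

On failure atomic_compare_exchange_weak_explicit() refreshes old with the current value, so the bailout condition is re-evaluated against fresh state, exactly as the kernel loop does with atomic_try_cmpxchg_acquire(). The real function additionally handles the percpu reader mode, where the read count lives outside the state word.
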
++ * ++ * When we need to take a write lock, the read locks will cause self-deadlock, ++ * because six locks themselves do not track which read locks are held by the ++ * current thread and which are held by a different thread - it does no ++ * per-thread tracking of held locks. ++ * ++ * The upper layer that is tracking held locks may however, if trylock() has ++ * failed, count up its own read locks, subtract them, take the write lock, and ++ * then re-add them. ++ * ++ * As in any other situation when taking a write lock, @lock must be held for ++ * intent one (or more) times, so @lock will never be left unlocked. ++ */ ++void six_lock_readers_add(struct six_lock *lock, int nr) ++{ ++ if (lock->readers) { ++ this_cpu_add(*lock->readers, nr); ++ } else { ++ EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0); ++ /* reader count starts at bit 0 */ ++ atomic_add(nr, &lock->state); ++ } ++} ++EXPORT_SYMBOL_GPL(six_lock_readers_add); ++ ++/** ++ * six_lock_exit - release resources held by a lock prior to freeing ++ * @lock: lock to exit ++ * ++ * When a lock was initialized in percpu mode (SIX_OLCK_INIT_PCPU), this is ++ * required to free the percpu read counts. ++ */ ++void six_lock_exit(struct six_lock *lock) ++{ ++ WARN_ON(lock->readers && pcpu_read_count(lock)); ++ WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read); ++ ++ free_percpu(lock->readers); ++ lock->readers = NULL; ++} ++EXPORT_SYMBOL_GPL(six_lock_exit); ++ ++void __six_lock_init(struct six_lock *lock, const char *name, ++ struct lock_class_key *key, enum six_lock_init_flags flags) ++{ ++ atomic_set(&lock->state, 0); ++ raw_spin_lock_init(&lock->wait_lock); ++ INIT_LIST_HEAD(&lock->wait_list); ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++ ++ /* ++ * Don't assume that we have real percpu variables available in ++ * userspace: ++ */ ++#ifdef __KERNEL__ ++ if (flags & SIX_LOCK_INIT_PCPU) { ++ /* ++ * We don't return an error here on memory allocation failure ++ * since percpu is an optimization, and locks will work with the ++ * same semantics in non-percpu mode: callers can check for ++ * failure if they wish by checking lock->readers, but generally ++ * will not want to treat it as an error. ++ */ ++ lock->readers = alloc_percpu(unsigned); ++ } ++#endif ++} ++EXPORT_SYMBOL_GPL(__six_lock_init); +diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h +new file mode 100644 +index 000000000..394da423c +--- /dev/null ++++ b/fs/bcachefs/six.h +@@ -0,0 +1,388 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _LINUX_SIX_H ++#define _LINUX_SIX_H ++ ++/** ++ * DOC: SIX locks overview ++ * ++ * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores ++ * but with an additional state: read/shared, intent, exclusive/write ++ * ++ * The purpose of the intent state is to allow for greater concurrency on tree ++ * structures without deadlocking. In general, a read can't be upgraded to a ++ * write lock without deadlocking, so an operation that updates multiple nodes ++ * will have to take write locks for the full duration of the operation. ++ * ++ * But by adding an intent state, which is exclusive with other intent locks but ++ * not with readers, we can take intent locks at thte start of the operation, ++ * and then take write locks only for the actual update to each individual ++ * nodes, without deadlocking. 
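
six_lock_readers_add() implements the escape hatch just described: an upper layer that knows how many of the conflicting read holds are its own can hide them, take the write lock, and restore them. An illustrative usage sketch (not a standalone program; it assumes the six.h interface added by this patch, and the wrapper plus the caller's bookkeeping of nr_reads are hypothetical):

#include "six.h"

/*
 * The caller already holds the intent lock plus @nr_reads read locks on
 * @lock (counted by its own lock tracking) and now needs the write lock.
 * Taking it directly would self-deadlock on our own readers.
 */
static void take_write_with_own_readers(struct six_lock *lock, unsigned nr_reads)
{
	six_lock_readers_add(lock, -(int) nr_reads);	/* hide our own read holds */
	six_lock_write(lock, NULL, NULL);		/* only foreign readers block us now */
	six_lock_readers_add(lock, (int) nr_reads);	/* restore the counts */
}
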
++ * ++ * Example usage: ++ * six_lock_read(&foo->lock); ++ * six_unlock_read(&foo->lock); ++ * ++ * An intent lock must be held before taking a write lock: ++ * six_lock_intent(&foo->lock); ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * Other operations: ++ * six_trylock_read() ++ * six_trylock_intent() ++ * six_trylock_write() ++ * ++ * six_lock_downgrade() convert from intent to read ++ * six_lock_tryupgrade() attempt to convert from read to intent, may fail ++ * ++ * There are also interfaces that take the lock type as an enum: ++ * ++ * six_lock_type(&foo->lock, SIX_LOCK_read); ++ * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent) ++ * six_lock_type(&foo->lock, SIX_LOCK_write); ++ * six_unlock_type(&foo->lock, SIX_LOCK_write); ++ * six_unlock_type(&foo->lock, SIX_LOCK_intent); ++ * ++ * Lock sequence numbers - unlock(), relock(): ++ * ++ * Locks embed sequences numbers, which are incremented on write lock/unlock. ++ * This allows locks to be dropped and the retaken iff the state they protect ++ * hasn't changed; this makes it much easier to avoid holding locks while e.g. ++ * doing IO or allocating memory. ++ * ++ * Example usage: ++ * six_lock_read(&foo->lock); ++ * u32 seq = six_lock_seq(&foo->lock); ++ * six_unlock_read(&foo->lock); ++ * ++ * some_operation_that_may_block(); ++ * ++ * if (six_relock_read(&foo->lock, seq)) { ... } ++ * ++ * If the relock operation succeeds, it is as if the lock was never unlocked. ++ * ++ * Reentrancy: ++ * ++ * Six locks are not by themselves reentrent, but have counters for both the ++ * read and intent states that can be used to provide reentrency by an upper ++ * layer that tracks held locks. If a lock is known to already be held in the ++ * read or intent state, six_lock_increment() can be used to bump the "lock ++ * held in this state" counter, increasing the number of unlock calls that ++ * will be required to fully unlock it. ++ * ++ * Example usage: ++ * six_lock_read(&foo->lock); ++ * six_lock_increment(&foo->lock, SIX_LOCK_read); ++ * six_unlock_read(&foo->lock); ++ * six_unlock_read(&foo->lock); ++ * foo->lock is now fully unlocked. ++ * ++ * Since the intent state supercedes read, it's legal to increment the read ++ * counter when holding an intent lock, but not the reverse. ++ * ++ * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) ++ * is not legal. ++ * ++ * should_sleep_fn: ++ * ++ * There is a six_lock() variant that takes a function pointer that is called ++ * immediately prior to schedule() when blocking, and may return an error to ++ * abort. ++ * ++ * One possible use for this feature is when objects being locked are part of ++ * a cache and may reused, and lock ordering is based on a property of the ++ * object that will change when the object is reused - i.e. logical key order. ++ * ++ * If looking up an object in the cache may race with object reuse, and lock ++ * ordering is required to prevent deadlock, object reuse may change the ++ * correct lock order for that object and cause a deadlock. should_sleep_fn ++ * can be used to check if the object is still the object we want and avoid ++ * this deadlock. ++ * ++ * Wait list entry interface: ++ * ++ * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a ++ * wait list entry. 
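
The "unlock(), relock()" discipline documented above is normally wrapped in a retry loop: remember the sequence number, drop the lock across the blocking operation, and start over if a writer got in. An illustrative sketch using only the helpers declared in this header (do_blocking_work() and the retry policy are made up for the example):

#include "six.h"

extern void do_blocking_work(void);	/* stand-in for IO, allocation, etc. */

static void read_then_block(struct six_lock *lock)
{
	u32 seq;
retry:
	six_lock_read(lock, NULL, NULL);
	seq = six_lock_seq(lock);		/* snapshot taken while protected */
	six_unlock_read(lock);

	do_blocking_work();

	if (!six_relock_read(lock, seq))
		goto retry;			/* a write intervened: state may be stale */

	/* ... proceed as if the lock had never been dropped ... */
	six_unlock_read(lock);
}
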
By embedding six_lock_waiter into another object, and by ++ * traversing lock waitlists, it is then possible for an upper layer to ++ * implement full cycle detection for deadlock avoidance. ++ * ++ * should_sleep_fn should be used for invoking the cycle detector, walking the ++ * graph of held locks to check for a deadlock. The upper layer must track ++ * held locks for each thread, and each thread's held locks must be reachable ++ * from its six_lock_waiter object. ++ * ++ * six_lock_waiter() will add the wait object to the waitlist re-trying taking ++ * the lock, and before calling should_sleep_fn, and the wait object will not ++ * be removed from the waitlist until either the lock has been successfully ++ * acquired, or we aborted because should_sleep_fn returned an error. ++ * ++ * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will ++ * have timestamps in strictly ascending order - this is so the timestamp can ++ * be used as a cursor for lock graph traverse. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++enum six_lock_type { ++ SIX_LOCK_read, ++ SIX_LOCK_intent, ++ SIX_LOCK_write, ++}; ++ ++struct six_lock { ++ atomic_t state; ++ u32 seq; ++ unsigned intent_lock_recurse; ++ struct task_struct *owner; ++ unsigned __percpu *readers; ++ struct optimistic_spin_queue osq; ++ raw_spinlock_t wait_lock; ++ struct list_head wait_list; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++struct six_lock_waiter { ++ struct list_head list; ++ struct task_struct *task; ++ enum six_lock_type lock_want; ++ bool lock_acquired; ++ u64 start_time; ++}; ++ ++typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); ++ ++void six_lock_exit(struct six_lock *lock); ++ ++enum six_lock_init_flags { ++ SIX_LOCK_INIT_PCPU = 1U << 0, ++}; ++ ++void __six_lock_init(struct six_lock *lock, const char *name, ++ struct lock_class_key *key, enum six_lock_init_flags flags); ++ ++/** ++ * six_lock_init - initialize a six lock ++ * @lock: lock to initialize ++ * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU ++ */ ++#define six_lock_init(lock, flags) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __six_lock_init((lock), #lock, &__key, flags); \ ++} while (0) ++ ++/** ++ * six_lock_seq - obtain current lock sequence number ++ * @lock: six_lock to obtain sequence number for ++ * ++ * @lock should be held for read or intent, and not write ++ * ++ * By saving the lock sequence number, we can unlock @lock and then (typically ++ * after some blocking operation) attempt to relock it: the relock will succeed ++ * if the sequence number hasn't changed, meaning no write locks have been taken ++ * and state corresponding to what @lock protects is still valid. ++ */ ++static inline u32 six_lock_seq(const struct six_lock *lock) ++{ ++ return lock->seq; ++} ++ ++bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); ++ ++/** ++ * six_trylock_type - attempt to take a six lock without blocking ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * ++ * Return: true on success, false on failure. 
++ */ ++static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ return six_trylock_ip(lock, type, _THIS_IP_); ++} ++ ++int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, ++ struct six_lock_waiter *wait, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip); ++ ++/** ++ * six_lock_waiter - take a lock, with full waitlist interface ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @wait: pointer to wait object, which will be added to lock's waitlist ++ * @should_sleep_fn: callback run after adding to waitlist, immediately prior ++ * to scheduling ++ * @p: passed through to @should_sleep_fn ++ * ++ * This is a convenience wrapper around six_lock_ip_waiter(), see that function ++ * for full documentation. ++ * ++ * Return: 0 on success, or the return code from @should_sleep_fn on failure. ++ */ ++static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type, ++ struct six_lock_waiter *wait, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); ++} ++ ++/** ++ * six_lock_ip - take a six lock lock ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @should_sleep_fn: callback run after adding to waitlist, immediately prior ++ * to scheduling ++ * @p: passed through to @should_sleep_fn ++ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ ++ * ++ * Return: 0 on success, or the return code from @should_sleep_fn on failure. ++ */ ++static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) ++{ ++ struct six_lock_waiter wait; ++ ++ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); ++} ++ ++/** ++ * six_lock_type - take a six lock lock ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @should_sleep_fn: callback run after adding to waitlist, immediately prior ++ * to scheduling ++ * @p: passed through to @should_sleep_fn ++ * ++ * Return: 0 on success, or the return code from @should_sleep_fn on failure. ++ */ ++static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ struct six_lock_waiter wait; ++ ++ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); ++} ++ ++bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq, unsigned long ip); ++ ++/** ++ * six_relock_type - attempt to re-take a lock that was held previously ++ * @lock: lock to take ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * @seq: lock sequence number obtained from six_lock_seq() while lock was ++ * held previously ++ * ++ * Return: true on success, false on failure. ++ */ ++static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ return six_relock_ip(lock, type, seq, _THIS_IP_); ++} ++ ++void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); ++ ++/** ++ * six_unlock_type - drop a six lock ++ * @lock: lock to unlock ++ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write ++ * ++ * When a lock is held multiple times (because six_lock_incement()) was used), ++ * this decrements the 'lock held' counter by one. 
++ * ++ * For example: ++ * six_lock_read(&foo->lock); read count 1 ++ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 ++ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 ++ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 ++ */ ++static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ six_unlock_ip(lock, type, _THIS_IP_); ++} ++ ++#define __SIX_LOCK(type) \ ++static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ ++{ \ ++ return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ ++} \ ++ \ ++static inline bool six_trylock_##type(struct six_lock *lock) \ ++{ \ ++ return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ ++} \ ++ \ ++static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ ++ struct six_lock_waiter *wait, \ ++ six_lock_should_sleep_fn should_sleep_fn, void *p,\ ++ unsigned long ip) \ ++{ \ ++ return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ ++} \ ++ \ ++static inline int six_lock_ip_##type(struct six_lock *lock, \ ++ six_lock_should_sleep_fn should_sleep_fn, void *p, \ ++ unsigned long ip) \ ++{ \ ++ return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ ++} \ ++ \ ++static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ ++{ \ ++ return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ ++} \ ++ \ ++static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ ++{ \ ++ return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ ++} \ ++ \ ++static inline int six_lock_##type(struct six_lock *lock, \ ++ six_lock_should_sleep_fn fn, void *p)\ ++{ \ ++ return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ ++} \ ++ \ ++static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ ++{ \ ++ six_unlock_ip(lock, SIX_LOCK_##type, ip); \ ++} \ ++ \ ++static inline void six_unlock_##type(struct six_lock *lock) \ ++{ \ ++ six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ ++} ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++#undef __SIX_LOCK ++ ++void six_lock_downgrade(struct six_lock *); ++bool six_lock_tryupgrade(struct six_lock *); ++bool six_trylock_convert(struct six_lock *, enum six_lock_type, ++ enum six_lock_type); ++ ++void six_lock_increment(struct six_lock *, enum six_lock_type); ++ ++void six_lock_wakeup_all(struct six_lock *); ++ ++struct six_lock_count { ++ unsigned n[3]; ++}; ++ ++struct six_lock_count six_lock_counts(struct six_lock *); ++void six_lock_readers_add(struct six_lock *, int); ++ ++#endif /* _LINUX_SIX_H */ +diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c +new file mode 100644 +index 000000000..9da099114 +--- /dev/null ++++ b/fs/bcachefs/snapshot.c +@@ -0,0 +1,1687 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_buf.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "errcode.h" ++#include "error.h" ++#include "fs.h" ++#include "snapshot.h" ++ ++#include ++ ++/* ++ * Snapshot trees: ++ * ++ * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they ++ * exist to provide a stable identifier for the whole lifetime of a snapshot ++ * tree. 
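++ *
++ * A minimal lookup sketch (illustrative only; assumes @trans is an open
++ * btree transaction and @tree_id was read from a snapshot key's tree field):
++ *
++ *	struct bch_snapshot_tree s_t;
++ *	int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
++ *	if (!ret)
++ *		root_id = le32_to_cpu(s_t.root_snapshot);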
++ */ ++ ++void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); ++ ++ prt_printf(out, "subvol %u root snapshot %u", ++ le32_to_cpu(t.v->master_subvol), ++ le32_to_cpu(t.v->root_snapshot)); ++} ++ ++int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) ++{ ++ if (bkey_gt(k.k->p, POS(0, U32_MAX)) || ++ bkey_lt(k.k->p, POS(0, 1))) { ++ prt_printf(err, "bad pos"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ return 0; ++} ++ ++int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, ++ struct bch_snapshot_tree *s) ++{ ++ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), ++ BTREE_ITER_WITH_UPDATES, snapshot_tree, s); ++ ++ if (bch2_err_matches(ret, ENOENT)) ++ ret = -BCH_ERR_ENOENT_snapshot_tree; ++ return ret; ++} ++ ++struct bkey_i_snapshot_tree * ++__bch2_snapshot_tree_create(struct btree_trans *trans) ++{ ++ struct btree_iter iter; ++ int ret = bch2_bkey_get_empty_slot(trans, &iter, ++ BTREE_ID_snapshot_trees, POS(0, U32_MAX)); ++ struct bkey_i_snapshot_tree *s_t; ++ ++ if (ret == -BCH_ERR_ENOSPC_btree_slot) ++ ret = -BCH_ERR_ENOSPC_snapshot_tree; ++ if (ret) ++ return ERR_PTR(ret); ++ ++ s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); ++ ret = PTR_ERR_OR_ZERO(s_t); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret ? ERR_PTR(ret) : s_t; ++} ++ ++static int bch2_snapshot_tree_create(struct btree_trans *trans, ++ u32 root_id, u32 subvol_id, u32 *tree_id) ++{ ++ struct bkey_i_snapshot_tree *n_tree = ++ __bch2_snapshot_tree_create(trans); ++ ++ if (IS_ERR(n_tree)) ++ return PTR_ERR(n_tree); ++ ++ n_tree->v.master_subvol = cpu_to_le32(subvol_id); ++ n_tree->v.root_snapshot = cpu_to_le32(root_id); ++ *tree_id = n_tree->k.p.offset; ++ return 0; ++} ++ ++/* Snapshot nodes: */ ++ ++static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) ++{ ++ struct snapshot_table *t; ++ ++ rcu_read_lock(); ++ t = rcu_dereference(c->snapshots); ++ ++ while (id && id < ancestor) ++ id = __snapshot_t(t, id)->parent; ++ rcu_read_unlock(); ++ ++ return id == ancestor; ++} ++ ++static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) ++{ ++ const struct snapshot_t *s = __snapshot_t(t, id); ++ ++ if (s->skip[2] <= ancestor) ++ return s->skip[2]; ++ if (s->skip[1] <= ancestor) ++ return s->skip[1]; ++ if (s->skip[0] <= ancestor) ++ return s->skip[0]; ++ return s->parent; ++} ++ ++bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) ++{ ++ struct snapshot_table *t; ++ bool ret; ++ ++ EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); ++ ++ rcu_read_lock(); ++ t = rcu_dereference(c->snapshots); ++ ++ while (id && id < ancestor - IS_ANCESTOR_BITMAP) ++ id = get_ancestor_below(t, id, ancestor); ++ ++ if (id && id < ancestor) { ++ ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor); ++ ++ EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor)); ++ } else { ++ ret = id == ancestor; ++ } ++ ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++struct snapshot_t_free_rcu { ++ struct rcu_head rcu; ++ struct snapshot_table *t; ++}; ++ ++static void snapshot_t_free_rcu(struct rcu_head *rcu) ++{ ++ struct snapshot_t_free_rcu *free_rcu = ++ container_of(rcu, struct snapshot_t_free_rcu, rcu); ++ ++ kvfree(free_rcu->t); ++ kfree(free_rcu); ++} ++ ++static noinline struct snapshot_t 
*__snapshot_t_mut(struct bch_fs *c, u32 id) ++{ ++ size_t idx = U32_MAX - id; ++ size_t new_size; ++ struct snapshot_table *new, *old; ++ ++ new_size = max(16UL, roundup_pow_of_two(idx + 1)); ++ ++ new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); ++ if (!new) ++ return NULL; ++ ++ old = rcu_dereference_protected(c->snapshots, true); ++ if (old) ++ memcpy(new->s, ++ rcu_dereference_protected(c->snapshots, true)->s, ++ sizeof(new->s[0]) * c->snapshot_table_size); ++ ++ rcu_assign_pointer(c->snapshots, new); ++ c->snapshot_table_size = new_size; ++ if (old) { ++ struct snapshot_t_free_rcu *rcu = ++ kmalloc(sizeof(*rcu), GFP_KERNEL|__GFP_NOFAIL); ++ ++ rcu->t = old; ++ call_rcu(&rcu->rcu, snapshot_t_free_rcu); ++ } ++ ++ return &rcu_dereference_protected(c->snapshots, true)->s[idx]; ++} ++ ++static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) ++{ ++ size_t idx = U32_MAX - id; ++ ++ lockdep_assert_held(&c->snapshot_table_lock); ++ ++ if (likely(idx < c->snapshot_table_size)) ++ return &rcu_dereference_protected(c->snapshots, true)->s[idx]; ++ ++ return __snapshot_t_mut(c, id); ++} ++ ++void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); ++ ++ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", ++ BCH_SNAPSHOT_SUBVOL(s.v), ++ BCH_SNAPSHOT_DELETED(s.v), ++ le32_to_cpu(s.v->parent), ++ le32_to_cpu(s.v->children[0]), ++ le32_to_cpu(s.v->children[1]), ++ le32_to_cpu(s.v->subvol), ++ le32_to_cpu(s.v->tree)); ++ ++ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) ++ prt_printf(out, " depth %u skiplist %u %u %u", ++ le32_to_cpu(s.v->depth), ++ le32_to_cpu(s.v->skip[0]), ++ le32_to_cpu(s.v->skip[1]), ++ le32_to_cpu(s.v->skip[2])); ++} ++ ++int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) ++{ ++ struct bkey_s_c_snapshot s; ++ u32 i, id; ++ ++ if (bkey_gt(k.k->p, POS(0, U32_MAX)) || ++ bkey_lt(k.k->p, POS(0, 1))) { ++ prt_printf(err, "bad pos"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ s = bkey_s_c_to_snapshot(k); ++ ++ id = le32_to_cpu(s.v->parent); ++ if (id && id <= k.k->p.offset) { ++ prt_printf(err, "bad parent node (%u <= %llu)", ++ id, k.k->p.offset); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { ++ prt_printf(err, "children not normalized"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ if (s.v->children[0] && ++ s.v->children[0] == s.v->children[1]) { ++ prt_printf(err, "duplicate child nodes"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ for (i = 0; i < 2; i++) { ++ id = le32_to_cpu(s.v->children[i]); ++ ++ if (id >= k.k->p.offset) { ++ prt_printf(err, "bad child node (%u >= %llu)", ++ id, k.k->p.offset); ++ return -BCH_ERR_invalid_bkey; ++ } ++ } ++ ++ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { ++ if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || ++ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { ++ prt_printf(err, "skiplist not normalized"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { ++ id = le32_to_cpu(s.v->skip[i]); ++ ++ if ((id && !s.v->parent) || ++ (id && id <= k.k->p.offset)) { ++ prt_printf(err, "bad skiplist node %u", id); ++ return -BCH_ERR_invalid_bkey; ++ } ++ } ++ } ++ ++ return 0; ++} ++ ++static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) ++{ ++ 
struct snapshot_t *t = snapshot_t_mut(c, id); ++ u32 parent = id; ++ ++ while ((parent = bch2_snapshot_parent_early(c, parent)) && ++ parent - id - 1 < IS_ANCESTOR_BITMAP) ++ __set_bit(parent - id - 1, t->is_ancestor); ++} ++ ++static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) ++{ ++ mutex_lock(&c->snapshot_table_lock); ++ __set_is_ancestor_bitmap(c, id); ++ mutex_unlock(&c->snapshot_table_lock); ++} ++ ++int bch2_mark_snapshot(struct btree_trans *trans, ++ enum btree_id btree, unsigned level, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct snapshot_t *t; ++ u32 id = new.k->p.offset; ++ int ret = 0; ++ ++ mutex_lock(&c->snapshot_table_lock); ++ ++ t = snapshot_t_mut(c, id); ++ if (!t) { ++ ret = -BCH_ERR_ENOMEM_mark_snapshot; ++ goto err; ++ } ++ ++ if (new.k->type == KEY_TYPE_snapshot) { ++ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); ++ ++ t->parent = le32_to_cpu(s.v->parent); ++ t->children[0] = le32_to_cpu(s.v->children[0]); ++ t->children[1] = le32_to_cpu(s.v->children[1]); ++ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; ++ t->tree = le32_to_cpu(s.v->tree); ++ ++ if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { ++ t->depth = le32_to_cpu(s.v->depth); ++ t->skip[0] = le32_to_cpu(s.v->skip[0]); ++ t->skip[1] = le32_to_cpu(s.v->skip[1]); ++ t->skip[2] = le32_to_cpu(s.v->skip[2]); ++ } else { ++ t->depth = 0; ++ t->skip[0] = 0; ++ t->skip[1] = 0; ++ t->skip[2] = 0; ++ } ++ ++ __set_is_ancestor_bitmap(c, id); ++ ++ if (BCH_SNAPSHOT_DELETED(s.v)) { ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots); ++ } ++ } else { ++ memset(t, 0, sizeof(*t)); ++ } ++err: ++ mutex_unlock(&c->snapshot_table_lock); ++ return ret; ++} ++ ++int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, ++ struct bch_snapshot *s) ++{ ++ return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), ++ BTREE_ITER_WITH_UPDATES, snapshot, s); ++} ++ ++int bch2_snapshot_live(struct btree_trans *trans, u32 id) ++{ ++ struct bch_snapshot v; ++ int ret; ++ ++ if (!id) ++ return 0; ++ ++ ret = bch2_snapshot_lookup(trans, id, &v); ++ if (bch2_err_matches(ret, ENOENT)) ++ bch_err(trans->c, "snapshot node %u not found", id); ++ if (ret) ++ return ret; ++ ++ return !BCH_SNAPSHOT_DELETED(&v); ++} ++ ++/* ++ * If @k is a snapshot with just one live child, it's part of a linear chain, ++ * which we consider to be an equivalence class: and then after snapshot ++ * deletion cleanup, there should only be a single key at a given position in ++ * this equivalence class. ++ * ++ * This sets the equivalence class of @k to be the child's equivalence class, if ++ * it's part of such a linear chain: this correctly sets equivalence classes on ++ * startup if we run leaf to root (i.e. in natural key order). 
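++ *
++ * Worked example (ids are illustrative): if snapshot 5 has a single live
++ * child 3, and 3 has a single live child 1 (a leaf), then 5, 3 and 1 form one
++ * linear chain; walking in natural key order sets equiv(1) = 1, then
++ * equiv(3) = equiv(1) = 1, then equiv(5) = equiv(3) = 1 - i.e. the whole
++ * class collapses to the id of the leaf-most node.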
++ */ ++int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned i, nr_live = 0, live_idx = 0; ++ struct bkey_s_c_snapshot snap; ++ u32 id = k.k->p.offset, child[2]; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ ++ child[0] = le32_to_cpu(snap.v->children[0]); ++ child[1] = le32_to_cpu(snap.v->children[1]); ++ ++ for (i = 0; i < 2; i++) { ++ int ret = bch2_snapshot_live(trans, child[i]); ++ ++ if (ret < 0) ++ return ret; ++ ++ if (ret) ++ live_idx = i; ++ nr_live += ret; ++ } ++ ++ mutex_lock(&c->snapshot_table_lock); ++ ++ snapshot_t_mut(c, id)->equiv = nr_live == 1 ++ ? snapshot_t_mut(c, child[live_idx])->equiv ++ : id; ++ ++ mutex_unlock(&c->snapshot_table_lock); ++ ++ return 0; ++} ++ ++/* fsck: */ ++ ++static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) ++{ ++ return snapshot_t(c, id)->children[child]; ++} ++ ++static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) ++{ ++ return bch2_snapshot_child(c, id, 0); ++} ++ ++static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) ++{ ++ return bch2_snapshot_child(c, id, 1); ++} ++ ++static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) ++{ ++ u32 n, parent; ++ ++ n = bch2_snapshot_left_child(c, id); ++ if (n) ++ return n; ++ ++ while ((parent = bch2_snapshot_parent(c, id))) { ++ n = bch2_snapshot_right_child(c, parent); ++ if (n && n != id) ++ return n; ++ id = parent; ++ } ++ ++ return 0; ++} ++ ++static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) ++{ ++ u32 id = snapshot_root; ++ u32 subvol = 0, s; ++ ++ while (id) { ++ s = snapshot_t(c, id)->subvol; ++ ++ if (s && (!subvol || s < subvol)) ++ subvol = s; ++ ++ id = bch2_snapshot_tree_next(c, id); ++ } ++ ++ return subvol; ++} ++ ++static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, ++ u32 snapshot_root, u32 *subvol_id) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_subvolume s; ++ bool found = false; ++ int ret; ++ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, ++ 0, k, ret) { ++ if (k.k->type != KEY_TYPE_subvolume) ++ continue; ++ ++ s = bkey_s_c_to_subvolume(k); ++ if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) ++ continue; ++ if (!BCH_SUBVOLUME_SNAP(s.v)) { ++ *subvol_id = s.k->p.offset; ++ found = true; ++ break; ++ } ++ } ++ ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (!ret && !found) { ++ struct bkey_i_subvolume *s; ++ ++ *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); ++ ++ s = bch2_bkey_get_mut_typed(trans, &iter, ++ BTREE_ID_subvolumes, POS(0, *subvol_id), ++ 0, subvolume); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (ret) ++ return ret; ++ ++ SET_BCH_SUBVOLUME_SNAP(&s->v, false); ++ } ++ ++ return ret; ++} ++ ++static int check_snapshot_tree(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c_snapshot_tree st; ++ struct bch_snapshot s; ++ struct bch_subvolume subvol; ++ struct printbuf buf = PRINTBUF; ++ u32 root_id; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_snapshot_tree) ++ return 0; ++ ++ st = bkey_s_c_to_snapshot_tree(k); ++ root_id = le32_to_cpu(st.v->root_snapshot); ++ ++ ret = bch2_snapshot_lookup(trans, root_id, &s); ++ if (ret && !bch2_err_matches(ret, ENOENT)) ++ goto err; ++ ++ if (fsck_err_on(ret || ++ root_id != bch2_snapshot_root(c, root_id) || ++ 
st.k->p.offset != le32_to_cpu(s.tree), ++ c, ++ "snapshot tree points to missing/incorrect snapshot:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, 0); ++ goto err; ++ } ++ ++ ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), ++ false, 0, &subvol); ++ if (ret && !bch2_err_matches(ret, ENOENT)) ++ goto err; ++ ++ if (fsck_err_on(ret, c, ++ "snapshot tree points to missing subvolume:\n %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || ++ fsck_err_on(!bch2_snapshot_is_ancestor_early(c, ++ le32_to_cpu(subvol.snapshot), ++ root_id), c, ++ "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || ++ fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c, ++ "snapshot tree points to snapshot subvolume:\n %s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { ++ struct bkey_i_snapshot_tree *u; ++ u32 subvol_id; ++ ++ ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); ++ if (ret) ++ goto err; ++ ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ u->v.master_subvol = cpu_to_le32(subvol_id); ++ st = snapshot_tree_i_to_s_c(u); ++ } ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++/* ++ * For each snapshot_tree, make sure it points to the root of a snapshot tree ++ * and that snapshot entry points back to it, or delete it. ++ * ++ * And, make sure it points to a subvolume within that snapshot tree, or correct ++ * it to point to the oldest subvolume within that snapshot tree. ++ */ ++int bch2_check_snapshot_trees(struct bch_fs *c) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ ret = bch2_trans_run(c, ++ for_each_btree_key_commit(&trans, iter, ++ BTREE_ID_snapshot_trees, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_snapshot_tree(&trans, &iter, k))); ++ ++ if (ret) ++ bch_err(c, "error %i checking snapshot trees", ret); ++ return ret; ++} ++ ++/* ++ * Look up snapshot tree for @tree_id and find root, ++ * make sure @snap_id is a descendent: ++ */ ++static int snapshot_tree_ptr_good(struct btree_trans *trans, ++ u32 snap_id, u32 tree_id) ++{ ++ struct bch_snapshot_tree s_t; ++ int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); ++ ++ if (bch2_err_matches(ret, ENOENT)) ++ return 0; ++ if (ret) ++ return ret; ++ ++ return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); ++} ++ ++u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) ++{ ++ const struct snapshot_t *s; ++ ++ if (!id) ++ return 0; ++ ++ rcu_read_lock(); ++ s = snapshot_t(c, id); ++ if (s->parent) ++ id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) ++{ ++ unsigned i; ++ ++ for (i = 0; i < 3; i++) ++ if (!s.parent) { ++ if (s.skip[i]) ++ return false; ++ } else { ++ if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i]))) ++ return false; ++ } ++ ++ return true; ++} ++ ++/* ++ * snapshot_tree pointer was incorrect: look up root snapshot node, make sure ++ * its snapshot_tree pointer is correct (allocate new one if necessary), then ++ * update this node's pointer to root node's pointer: ++ */ 
++static int snapshot_tree_ptr_repair(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ struct bch_snapshot *s) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter root_iter; ++ struct bch_snapshot_tree s_t; ++ struct bkey_s_c_snapshot root; ++ struct bkey_i_snapshot *u; ++ u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; ++ int ret; ++ ++ root = bch2_bkey_get_iter_typed(trans, &root_iter, ++ BTREE_ID_snapshots, POS(0, root_id), ++ BTREE_ITER_WITH_UPDATES, snapshot); ++ ret = bkey_err(root); ++ if (ret) ++ goto err; ++ ++ tree_id = le32_to_cpu(root.v->tree); ++ ++ ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); ++ if (ret && !bch2_err_matches(ret, ENOENT)) ++ return ret; ++ ++ if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { ++ u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(u) ?: ++ bch2_snapshot_tree_create(trans, root_id, ++ bch2_snapshot_tree_oldest_subvol(c, root_id), ++ &tree_id); ++ if (ret) ++ goto err; ++ ++ u->v.tree = cpu_to_le32(tree_id); ++ if (k.k->p.offset == root_id) ++ *s = u->v; ++ } ++ ++ if (k.k->p.offset != root_id) { ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ u->v.tree = cpu_to_le32(tree_id); ++ *s = u->v; ++ } ++err: ++ bch2_trans_iter_exit(trans, &root_iter); ++ return ret; ++} ++ ++static int check_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_snapshot s; ++ struct bch_subvolume subvol; ++ struct bch_snapshot v; ++ struct bkey_i_snapshot *u; ++ u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); ++ u32 real_depth; ++ struct printbuf buf = PRINTBUF; ++ bool should_have_subvol; ++ u32 i, id; ++ int ret = 0; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ memset(&s, 0, sizeof(s)); ++ memcpy(&s, k.v, bkey_val_bytes(k.k)); ++ ++ id = le32_to_cpu(s.parent); ++ if (id) { ++ ret = bch2_snapshot_lookup(trans, id, &v); ++ if (bch2_err_matches(ret, ENOENT)) ++ bch_err(c, "snapshot with nonexistent parent:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ if (ret) ++ goto err; ++ ++ if (le32_to_cpu(v.children[0]) != k.k->p.offset && ++ le32_to_cpu(v.children[1]) != k.k->p.offset) { ++ bch_err(c, "snapshot parent %u missing pointer to child %llu", ++ id, k.k->p.offset); ++ ret = -EINVAL; ++ goto err; ++ } ++ } ++ ++ for (i = 0; i < 2 && s.children[i]; i++) { ++ id = le32_to_cpu(s.children[i]); ++ ++ ret = bch2_snapshot_lookup(trans, id, &v); ++ if (bch2_err_matches(ret, ENOENT)) ++ bch_err(c, "snapshot node %llu has nonexistent child %u", ++ k.k->p.offset, id); ++ if (ret) ++ goto err; ++ ++ if (le32_to_cpu(v.parent) != k.k->p.offset) { ++ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", ++ id, le32_to_cpu(v.parent), k.k->p.offset); ++ ret = -EINVAL; ++ goto err; ++ } ++ } ++ ++ should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && ++ !BCH_SNAPSHOT_DELETED(&s); ++ ++ if (should_have_subvol) { ++ id = le32_to_cpu(s.subvol); ++ ret = bch2_subvolume_get(trans, id, 0, false, &subvol); ++ if (bch2_err_matches(ret, ENOENT)) ++ bch_err(c, "snapshot points to nonexistent subvolume:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); ++ if (ret) ++ goto err; ++ ++ if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { ++ bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", ++ k.k->p.offset); ++ ret = -EINVAL; ++ goto 
err; ++ } ++ } else { ++ if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ u->v.subvol = 0; ++ s = u->v; ++ } ++ } ++ ++ ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); ++ if (ret < 0) ++ goto err; ++ ++ if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = snapshot_tree_ptr_repair(trans, iter, k, &s); ++ if (ret) ++ goto err; ++ } ++ ret = 0; ++ ++ real_depth = bch2_snapshot_depth(c, parent_id); ++ ++ if (le32_to_cpu(s.depth) != real_depth && ++ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || ++ fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s", ++ real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ u->v.depth = cpu_to_le32(real_depth); ++ s = u->v; ++ } ++ ++ ret = snapshot_skiplist_good(trans, k.k->p.offset, s); ++ if (ret < 0) ++ goto err; ++ ++ if (!ret && ++ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || ++ fsck_err(c, "snapshot with bad skiplist field:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) ++ u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id)); ++ ++ bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); ++ s = u->v; ++ } ++ ret = 0; ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_check_snapshots(struct bch_fs *c) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ /* ++ * We iterate backwards as checking/fixing the depth field requires that ++ * the parent's depth already be correct: ++ */ ++ ret = bch2_trans_run(c, ++ for_each_btree_key_reverse_commit(&trans, iter, ++ BTREE_ID_snapshots, POS_MAX, ++ BTREE_ITER_PREFETCH, k, ++ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ++ check_snapshot(&trans, &iter, k))); ++ if (ret) ++ bch_err_fn(c, ret); ++ return ret; ++} ++ ++/* ++ * Mark a snapshot as deleted, for future cleanup: ++ */ ++int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) ++{ ++ struct btree_iter iter; ++ struct bkey_i_snapshot *s; ++ int ret = 0; ++ ++ s = bch2_bkey_get_mut_typed(trans, &iter, ++ BTREE_ID_snapshots, POS(0, id), ++ 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (unlikely(ret)) { ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), ++ trans->c, "missing snapshot %u", id); ++ return ret; ++ } ++ ++ /* already deleted? 
*/ ++ if (BCH_SNAPSHOT_DELETED(&s->v)) ++ goto err; ++ ++ SET_BCH_SNAPSHOT_DELETED(&s->v, true); ++ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); ++ s->v.subvol = 0; ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) ++{ ++ if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1])) ++ swap(s->children[0], s->children[1]); ++} ++ ++int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; ++ struct btree_iter c_iter = (struct btree_iter) { NULL }; ++ struct btree_iter tree_iter = (struct btree_iter) { NULL }; ++ struct bkey_s_c_snapshot s; ++ u32 parent_id, child_id; ++ unsigned i; ++ int ret = 0; ++ ++ s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), ++ BTREE_ITER_INTENT, snapshot); ++ ret = bkey_err(s); ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, ++ "missing snapshot %u", id); ++ ++ if (ret) ++ goto err; ++ ++ BUG_ON(s.v->children[1]); ++ ++ parent_id = le32_to_cpu(s.v->parent); ++ child_id = le32_to_cpu(s.v->children[0]); ++ ++ if (parent_id) { ++ struct bkey_i_snapshot *parent; ++ ++ parent = bch2_bkey_get_mut_typed(trans, &p_iter, ++ BTREE_ID_snapshots, POS(0, parent_id), ++ 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(parent); ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, ++ "missing snapshot %u", parent_id); ++ if (unlikely(ret)) ++ goto err; ++ ++ /* find entry in parent->children for node being deleted */ ++ for (i = 0; i < 2; i++) ++ if (le32_to_cpu(parent->v.children[i]) == id) ++ break; ++ ++ if (bch2_fs_inconsistent_on(i == 2, c, ++ "snapshot %u missing child pointer to %u", ++ parent_id, id)) ++ goto err; ++ ++ parent->v.children[i] = le32_to_cpu(child_id); ++ ++ normalize_snapshot_child_pointers(&parent->v); ++ } ++ ++ if (child_id) { ++ struct bkey_i_snapshot *child; ++ ++ child = bch2_bkey_get_mut_typed(trans, &c_iter, ++ BTREE_ID_snapshots, POS(0, child_id), ++ 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(child); ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, ++ "missing snapshot %u", child_id); ++ if (unlikely(ret)) ++ goto err; ++ ++ child->v.parent = cpu_to_le32(parent_id); ++ ++ if (!child->v.parent) { ++ child->v.skip[0] = 0; ++ child->v.skip[1] = 0; ++ child->v.skip[2] = 0; ++ } ++ } ++ ++ if (!parent_id) { ++ /* ++ * We're deleting the root of a snapshot tree: update the ++ * snapshot_tree entry to point to the new root, or delete it if ++ * this is the last snapshot ID in this tree: ++ */ ++ struct bkey_i_snapshot_tree *s_t; ++ ++ BUG_ON(s.v->children[1]); ++ ++ s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, ++ BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), ++ 0, snapshot_tree); ++ ret = PTR_ERR_OR_ZERO(s_t); ++ if (ret) ++ goto err; ++ ++ if (s.v->children[0]) { ++ s_t->v.root_snapshot = s.v->children[0]; ++ } else { ++ s_t->k.type = KEY_TYPE_deleted; ++ set_bkey_val_u64s(&s_t->k, 0); ++ } ++ } ++ ++ ret = bch2_btree_delete_at(trans, &iter, 0); ++err: ++ bch2_trans_iter_exit(trans, &tree_iter); ++ bch2_trans_iter_exit(trans, &p_iter); ++ bch2_trans_iter_exit(trans, &c_iter); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_i_snapshot *n; ++ struct bkey_s_c k; ++ 
unsigned i, j; ++ u32 depth = bch2_snapshot_depth(c, parent); ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, ++ POS_MIN, BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ for (i = 0; i < nr_snapids; i++) { ++ k = bch2_btree_iter_prev_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (!k.k || !k.k->p.offset) { ++ ret = -BCH_ERR_ENOSPC_snapshot_create; ++ goto err; ++ } ++ ++ n = bch2_bkey_alloc(trans, &iter, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ n->v.flags = 0; ++ n->v.parent = cpu_to_le32(parent); ++ n->v.subvol = cpu_to_le32(snapshot_subvols[i]); ++ n->v.tree = cpu_to_le32(tree); ++ n->v.depth = cpu_to_le32(depth); ++ ++ for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) ++ n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); ++ ++ bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); ++ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); ++ ++ ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, ++ bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); ++ if (ret) ++ goto err; ++ ++ new_snapids[i] = iter.pos.offset; ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/* ++ * Create new snapshot IDs as children of an existing snapshot ID: ++ */ ++static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) ++{ ++ struct btree_iter iter; ++ struct bkey_i_snapshot *n_parent; ++ int ret = 0; ++ ++ n_parent = bch2_bkey_get_mut_typed(trans, &iter, ++ BTREE_ID_snapshots, POS(0, parent), ++ 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(n_parent); ++ if (unlikely(ret)) { ++ if (bch2_err_matches(ret, ENOENT)) ++ bch_err(trans->c, "snapshot %u not found", parent); ++ return ret; ++ } ++ ++ if (n_parent->v.children[0] || n_parent->v.children[1]) { ++ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), ++ new_snapids, snapshot_subvols, nr_snapids); ++ if (ret) ++ goto err; ++ ++ n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); ++ n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); ++ n_parent->v.subvol = 0; ++ SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/* ++ * Create a snapshot node that is the root of a new tree: ++ */ ++static int bch2_snapshot_node_create_tree(struct btree_trans *trans, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) ++{ ++ struct bkey_i_snapshot_tree *n_tree; ++ int ret; ++ ++ n_tree = __bch2_snapshot_tree_create(trans); ++ ret = PTR_ERR_OR_ZERO(n_tree) ?: ++ create_snapids(trans, 0, n_tree->k.p.offset, ++ new_snapids, snapshot_subvols, nr_snapids); ++ if (ret) ++ return ret; ++ ++ n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); ++ n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); ++ return 0; ++} ++ ++int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, ++ u32 *new_snapids, ++ u32 *snapshot_subvols, ++ unsigned nr_snapids) ++{ ++ BUG_ON((parent == 0) != (nr_snapids == 1)); ++ BUG_ON((parent != 0) != (nr_snapids == 2)); ++ ++ return parent ++ ? 
bch2_snapshot_node_create_children(trans, parent, ++ new_snapids, snapshot_subvols, nr_snapids) ++ : bch2_snapshot_node_create_tree(trans, ++ new_snapids, snapshot_subvols, nr_snapids); ++ ++} ++ ++/* ++ * If we have an unlinked inode in an internal snapshot node, and the inode ++ * really has been deleted in all child snapshots, how does this get cleaned up? ++ * ++ * first there is the problem of how keys that have been overwritten in all ++ * child snapshots get deleted (unimplemented?), but inodes may perhaps be ++ * special? ++ * ++ * also: unlinked inode in internal snapshot appears to not be getting deleted ++ * correctly if inode doesn't exist in leaf snapshots ++ * ++ * solution: ++ * ++ * for a key in an interior snapshot node that needs work to be done that ++ * requires it to be mutated: iterate over all descendent leaf nodes and copy ++ * that key to snapshot leaf nodes, where we can mutate it ++ */ ++ ++static int snapshot_delete_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ snapshot_id_list *deleted, ++ snapshot_id_list *equiv_seen, ++ struct bpos *last_pos) ++{ ++ struct bch_fs *c = trans->c; ++ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ if (!bkey_eq(k.k->p, *last_pos)) ++ equiv_seen->nr = 0; ++ *last_pos = k.k->p; ++ ++ if (snapshot_list_has_id(deleted, k.k->p.snapshot) || ++ snapshot_list_has_id(equiv_seen, equiv)) { ++ return bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ } else { ++ return snapshot_list_add(c, equiv_seen, equiv); ++ } ++} ++ ++static int move_key_to_correct_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); ++ ++ /* ++ * When we have a linear chain of snapshot nodes, we consider ++ * those to form an equivalence class: we're going to collapse ++ * them all down to a single node, and keep the leaf-most node - ++ * which has the same id as the equivalence class id. ++ * ++ * If there are multiple keys in different snapshots at the same ++ * position, we're only going to keep the one in the newest ++ * snapshot - the rest have been overwritten and are redundant, ++ * and for the key we're going to keep we need to move it to the ++ * equivalance class ID if it's not there already. ++ */ ++ if (equiv != k.k->p.snapshot) { ++ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ++ struct btree_iter new_iter; ++ int ret; ++ ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ return ret; ++ ++ new->k.p.snapshot = equiv; ++ ++ bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, ++ BTREE_ITER_ALL_SNAPSHOTS| ++ BTREE_ITER_CACHED| ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_iter_traverse(&new_iter) ?: ++ bch2_trans_update(trans, &new_iter, new, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: ++ bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ++ bch2_trans_iter_exit(trans, &new_iter); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* ++ * For a given snapshot, if it doesn't have a subvolume that points to it, and ++ * it doesn't have child snapshot nodes - it's now redundant and we can mark it ++ * as deleted. 
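++ *
++ * For example: once both children of an interior node have themselves been
++ * marked deleted, they no longer count as live, so a later pass over the
++ * snapshots btree finds the interior node redundant and marks it deleted
++ * too (provided no subvolume points at it).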
++ */ ++static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_snapshot snap; ++ u32 children[2]; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ if (BCH_SNAPSHOT_DELETED(snap.v) || ++ BCH_SNAPSHOT_SUBVOL(snap.v)) ++ return 0; ++ ++ children[0] = le32_to_cpu(snap.v->children[0]); ++ children[1] = le32_to_cpu(snap.v->children[1]); ++ ++ ret = bch2_snapshot_live(trans, children[0]) ?: ++ bch2_snapshot_live(trans, children[1]); ++ if (ret < 0) ++ return ret; ++ ++ if (!ret) ++ return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); ++ return 0; ++} ++ ++static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, ++ snapshot_id_list *skip) ++{ ++ rcu_read_lock(); ++ while (n--) { ++ do { ++ id = __bch2_snapshot_parent(c, id); ++ } while (snapshot_list_has_id(skip, id)); ++ } ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, struct bkey_s_c k, ++ snapshot_id_list *deleted) ++{ ++ struct bch_fs *c = trans->c; ++ u32 nr_deleted_ancestors = 0; ++ struct bkey_i_snapshot *s; ++ u32 *i; ++ int ret; ++ ++ if (k.k->type != KEY_TYPE_snapshot) ++ return 0; ++ ++ if (snapshot_list_has_id(deleted, k.k->p.offset)) ++ return 0; ++ ++ s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (ret) ++ return ret; ++ ++ darray_for_each(*deleted, i) ++ nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); ++ ++ if (!nr_deleted_ancestors) ++ return 0; ++ ++ le32_add_cpu(&s->v.depth, -nr_deleted_ancestors); ++ ++ if (!s->v.depth) { ++ s->v.skip[0] = 0; ++ s->v.skip[1] = 0; ++ s->v.skip[2] = 0; ++ } else { ++ u32 depth = le32_to_cpu(s->v.depth); ++ u32 parent = bch2_snapshot_parent(c, s->k.p.offset); ++ ++ for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { ++ u32 id = le32_to_cpu(s->v.skip[j]); ++ ++ if (snapshot_list_has_id(deleted, id)) { ++ id = depth > 1 ++ ? 
bch2_snapshot_nth_parent_skip(c, ++ parent, ++ get_random_u32_below(depth - 1), ++ deleted) ++ : parent; ++ s->v.skip[j] = cpu_to_le32(id); ++ } ++ } ++ ++ bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32); ++ } ++ ++ return bch2_trans_update(trans, iter, &s->k_i, 0); ++} ++ ++int bch2_delete_dead_snapshots(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_snapshot snap; ++ snapshot_id_list deleted = { 0 }; ++ snapshot_id_list deleted_interior = { 0 }; ++ u32 *i, id; ++ int ret = 0; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) { ++ ret = bch2_fs_read_write_early(c); ++ if (ret) { ++ bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ } ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ /* ++ * For every snapshot node: If we have no live children and it's not ++ * pointed to by a subvolume, delete it: ++ */ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ NULL, NULL, 0, ++ bch2_delete_redundant_snapshot(&trans, &iter, k)); ++ if (ret) { ++ bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ bch2_snapshot_set_equiv(&trans, k)); ++ if (ret) { ++ bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); ++ goto err; ++ } ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_snapshot) ++ continue; ++ ++ snap = bkey_s_c_to_snapshot(k); ++ if (BCH_SNAPSHOT_DELETED(snap.v)) { ++ ret = snapshot_list_add(c, &deleted, k.k->p.offset); ++ if (ret) ++ break; ++ } ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ if (ret) { ++ bch_err_msg(c, ret, "walking snapshots"); ++ goto err; ++ } ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ struct bpos last_pos = POS_MIN; ++ snapshot_id_list equiv_seen = { 0 }; ++ struct disk_reservation res = { 0 }; ++ ++ if (!btree_type_has_snapshots(id)) ++ continue; ++ ++ ret = for_each_btree_key_commit(&trans, iter, ++ id, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ &res, NULL, BTREE_INSERT_NOFAIL, ++ snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: ++ for_each_btree_key_commit(&trans, iter, ++ id, POS_MIN, ++ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ++ &res, NULL, BTREE_INSERT_NOFAIL, ++ move_key_to_correct_snapshot(&trans, &iter, k)); ++ ++ bch2_disk_reservation_put(c, &res); ++ darray_exit(&equiv_seen); ++ ++ if (ret) { ++ bch_err_msg(c, ret, "deleting keys from dying snapshots"); ++ goto err; ++ } ++ } ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ret) { ++ u32 snapshot = k.k->p.offset; ++ u32 equiv = bch2_snapshot_equiv(c, snapshot); ++ ++ if (equiv != snapshot) ++ snapshot_list_add(c, &deleted_interior, snapshot); ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ ++ /* ++ * Fixing children of deleted snapshots can't be done completely ++ * atomically, if we crash between here and when we delete the interior ++ * nodes some depth fields will be off: ++ */ ++ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, POS_MIN, ++ BTREE_ITER_INTENT, k, ++ NULL, NULL, BTREE_INSERT_NOFAIL, ++ bch2_fix_child_of_deleted_snapshot(&trans, &iter, k, &deleted_interior)); ++ if (ret) ++ goto err; ++ ++ darray_for_each(deleted, i) { ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_snapshot_node_delete(&trans, *i)); ++ if (ret) { ++ 
bch_err_msg(c, ret, "deleting snapshot %u", *i); ++ goto err; ++ } ++ } ++ ++ darray_for_each(deleted_interior, i) { ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_snapshot_node_delete(&trans, *i)); ++ if (ret) { ++ bch_err_msg(c, ret, "deleting snapshot %u", *i); ++ goto err; ++ } ++ } ++ ++ clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++err: ++ darray_exit(&deleted_interior); ++ darray_exit(&deleted); ++ bch2_trans_exit(&trans); ++ if (ret) ++ bch_err_fn(c, ret); ++ return ret; ++} ++ ++void bch2_delete_dead_snapshots_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); ++ ++ if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) ++ bch2_delete_dead_snapshots(c); ++ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); ++} ++ ++void bch2_delete_dead_snapshots_async(struct bch_fs *c) ++{ ++ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && ++ !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) ++ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); ++} ++ ++int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, ++ struct btree_trans_commit_hook *h) ++{ ++ struct bch_fs *c = trans->c; ++ ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ ++ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) ++ return 0; ++ ++ bch2_delete_dead_snapshots_async(c); ++ return 0; ++} ++ ++int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, ++ enum btree_id id, ++ struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_iter_init(trans, &iter, id, pos, ++ BTREE_ITER_NOT_EXTENTS| ++ BTREE_ITER_ALL_SNAPSHOTS); ++ while (1) { ++ k = bch2_btree_iter_prev(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ if (!k.k) ++ break; ++ ++ if (!bkey_eq(pos, k.k->p)) ++ break; ++ ++ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { ++ ret = 1; ++ break; ++ } ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ return ret; ++} ++ ++static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) ++{ ++ const struct snapshot_t *s = snapshot_t(c, id); ++ ++ return s->children[1] ?: s->children[0]; ++} ++ ++static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) ++{ ++ u32 child; ++ ++ while ((child = bch2_snapshot_smallest_child(c, id))) ++ id = child; ++ return id; ++} ++ ++static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, ++ enum btree_id btree, ++ struct bkey_s_c interior_k, ++ u32 leaf_id, struct bpos *new_min_pos) ++{ ++ struct btree_iter iter; ++ struct bpos pos = interior_k.k->p; ++ struct bkey_s_c k; ++ struct bkey_i *new; ++ int ret; ++ ++ pos.snapshot = leaf_id; ++ ++ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(&iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ ++ /* key already overwritten in this snapshot? 
*/ ++ if (k.k->p.snapshot != interior_k.k->p.snapshot) ++ goto out; ++ ++ if (bpos_eq(*new_min_pos, POS_MIN)) { ++ *new_min_pos = k.k->p; ++ new_min_pos->snapshot = leaf_id; ++ } ++ ++ new = bch2_bkey_make_mut_noupdate(trans, interior_k); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ goto out; ++ ++ new->k.p.snapshot = leaf_id; ++ ret = bch2_trans_update(trans, &iter, new, 0); ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, ++ enum btree_id btree, ++ struct bkey_s_c k, ++ struct bpos *new_min_pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_buf sk; ++ int ret; ++ ++ bch2_bkey_buf_init(&sk); ++ bch2_bkey_buf_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ *new_min_pos = POS_MIN; ++ ++ for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot); ++ id < k.k->p.snapshot; ++ id++) { ++ if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || ++ !bch2_snapshot_is_leaf(c, id)) ++ continue; ++ ++ ret = commit_do(trans, NULL, NULL, 0, ++ bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos)); ++ if (ret) ++ break; ++ } ++ ++ bch2_bkey_buf_exit(&sk, c); ++ return ret; ++} ++ ++int bch2_snapshots_read(struct bch_fs *c) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ ret = bch2_trans_run(c, ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: ++ bch2_snapshot_set_equiv(&trans, k)) ?: ++ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, ++ POS_MIN, 0, k, ++ (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); ++ if (ret) ++ bch_err_fn(c, ret); ++ return ret; ++} ++ ++void bch2_fs_snapshots_exit(struct bch_fs *c) ++{ ++ kfree(rcu_dereference_protected(c->snapshots, true)); ++} +diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h +new file mode 100644 +index 000000000..dabc9b9d9 +--- /dev/null ++++ b/fs/bcachefs/snapshot.h +@@ -0,0 +1,272 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SNAPSHOT_H ++#define _BCACHEFS_SNAPSHOT_H ++ ++enum bkey_invalid_flags; ++ ++void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); ++ ++#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ ++ .key_invalid = bch2_snapshot_tree_invalid, \ ++ .val_to_text = bch2_snapshot_tree_to_text, \ ++ .min_val_size = 8, \ ++}) ++ ++struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); ++ ++int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); ++ ++void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); ++int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, ++ struct bkey_s_c, struct bkey_s_c, unsigned); ++ ++#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ ++ .key_invalid = bch2_snapshot_invalid, \ ++ .val_to_text = bch2_snapshot_to_text, \ ++ .atomic_trigger = bch2_mark_snapshot, \ ++ .min_val_size = 24, \ ++}) ++ ++static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) ++{ ++ return &t->s[U32_MAX - id]; ++} ++ ++static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) ++{ ++ return __snapshot_t(rcu_dereference(c->snapshots), id); ++} ++ 
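++/*
++ * A minimal usage sketch for the RCU-protected accessor above: hold
++ * rcu_read_lock() for as long as the returned snapshot_t is in use, as the
++ * small wrappers below do, e.g.:
++ *
++ *	rcu_read_lock();
++ *	parent = snapshot_t(c, id)->parent;
++ *	rcu_read_unlock();
++ */
++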
++static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) ++{ ++ rcu_read_lock(); ++ id = snapshot_t(c, id)->tree; ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) ++{ ++ return snapshot_t(c, id)->parent; ++} ++ ++static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) ++{ ++ rcu_read_lock(); ++ id = __bch2_snapshot_parent_early(c, id); ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ u32 parent = snapshot_t(c, id)->parent; ++ ++ if (parent && ++ snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) ++ panic("id %u depth=%u parent %u depth=%u\n", ++ id, snapshot_t(c, id)->depth, ++ parent, snapshot_t(c, parent)->depth); ++ ++ return parent; ++#else ++ return snapshot_t(c, id)->parent; ++#endif ++} ++ ++static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) ++{ ++ rcu_read_lock(); ++ id = __bch2_snapshot_parent(c, id); ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) ++{ ++ rcu_read_lock(); ++ while (n--) ++ id = __bch2_snapshot_parent(c, id); ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); ++ ++static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) ++{ ++ u32 parent; ++ ++ rcu_read_lock(); ++ while ((parent = __bch2_snapshot_parent(c, id))) ++ id = parent; ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) ++{ ++ return snapshot_t(c, id)->equiv; ++} ++ ++static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) ++{ ++ rcu_read_lock(); ++ id = __bch2_snapshot_equiv(c, id); ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) ++{ ++ return id == bch2_snapshot_equiv(c, id); ++} ++ ++static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) ++{ ++ const struct snapshot_t *s; ++ bool ret; ++ ++ rcu_read_lock(); ++ s = snapshot_t(c, id); ++ ret = s->children[0]; ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) ++{ ++ return !bch2_snapshot_is_internal_node(c, id); ++} ++ ++static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) ++{ ++ const struct snapshot_t *s; ++ u32 parent = __bch2_snapshot_parent(c, id); ++ ++ if (!parent) ++ return 0; ++ ++ s = snapshot_t(c, __bch2_snapshot_parent(c, id)); ++ if (id == s->children[0]) ++ return s->children[1]; ++ if (id == s->children[1]) ++ return s->children[0]; ++ return 0; ++} ++ ++static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) ++{ ++ u32 depth; ++ ++ rcu_read_lock(); ++ depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; ++ rcu_read_unlock(); ++ ++ return depth; ++} ++ ++bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); ++ ++static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) ++{ ++ return id == ancestor ++ ? 
true ++ : __bch2_snapshot_is_ancestor(c, id, ancestor); ++} ++ ++static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) ++{ ++ const struct snapshot_t *t; ++ bool ret; ++ ++ rcu_read_lock(); ++ t = snapshot_t(c, id); ++ ret = (t->children[0]|t->children[1]) != 0; ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) ++{ ++ u32 *i; ++ ++ darray_for_each(*s, i) ++ if (*i == id) ++ return true; ++ return false; ++} ++ ++static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) ++{ ++ u32 *i; ++ ++ darray_for_each(*s, i) ++ if (bch2_snapshot_is_ancestor(c, id, *i)) ++ return true; ++ return false; ++} ++ ++static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) ++{ ++ int ret; ++ ++ BUG_ON(snapshot_list_has_id(s, id)); ++ ret = darray_push(s, id); ++ if (ret) ++ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); ++ return ret; ++} ++ ++int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, ++ struct bch_snapshot *s); ++int bch2_snapshot_get_subvol(struct btree_trans *, u32, ++ struct bch_subvolume *); ++int bch2_snapshot_live(struct btree_trans *trans, u32 id); ++int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k); ++ ++/* only exported for tests: */ ++int bch2_snapshot_node_create(struct btree_trans *, u32, ++ u32 *, u32 *, unsigned); ++ ++int bch2_check_snapshot_trees(struct bch_fs *); ++int bch2_check_snapshots(struct bch_fs *); ++ ++int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); ++int bch2_delete_dead_snapshots_hook(struct btree_trans *, ++ struct btree_trans_commit_hook *); ++void bch2_delete_dead_snapshots_work(struct work_struct *); ++ ++int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); ++ ++static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, ++ enum btree_id id, ++ struct bpos pos) ++{ ++ if (!btree_type_has_snapshots(id) || ++ bch2_snapshot_is_leaf(trans->c, pos.snapshot)) ++ return 0; ++ ++ return __bch2_key_has_snapshot_overwrites(trans, id, pos); ++} ++ ++int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id, ++ struct bkey_s_c, struct bpos *); ++ ++int bch2_snapshots_read(struct bch_fs *); ++void bch2_fs_snapshots_exit(struct bch_fs *); ++ ++#endif /* _BCACHEFS_SNAPSHOT_H */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h new file mode 100644 index 000000000..ae21a8cca @@ -80812,10 +85220,10 @@ index 000000000..ae21a8cca +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 -index 000000000..811a6f428 +index 000000000..0214a98de --- /dev/null +++ b/fs/bcachefs/subvolume.c -@@ -0,0 +1,1749 @@ +@@ -0,0 +1,451 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -80824,861 +85232,13 @@ index 000000000..811a6f428 +#include "errcode.h" +#include "error.h" +#include "fs.h" ++#include "snapshot.h" +#include "subvolume.h" + +#include + +static int bch2_subvolume_delete(struct btree_trans *, u32); + -+static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) -+{ -+ const struct snapshot_t *s = __snapshot_t(t, id); -+ -+ if (s->skip[2] <= ancestor) -+ return s->skip[2]; -+ if (s->skip[1] <= ancestor) -+ return s->skip[1]; -+ if (s->skip[0] <= ancestor) -+ return s->skip[0]; -+ return s->parent; -+} -+ -+bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -+{ -+ 
struct snapshot_table *t; -+ bool ret; -+ -+ EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); -+ -+ rcu_read_lock(); -+ t = rcu_dereference(c->snapshots); -+ -+ while (id && id < ancestor - IS_ANCESTOR_BITMAP) -+ id = get_ancestor_below(t, id, ancestor); -+ -+ ret = id && id < ancestor -+ ? test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor) -+ : id == ancestor; -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) -+{ -+ struct snapshot_table *t; -+ -+ rcu_read_lock(); -+ t = rcu_dereference(c->snapshots); -+ -+ while (id && id < ancestor) -+ id = __snapshot_t(t, id)->parent; -+ rcu_read_unlock(); -+ -+ return id == ancestor; -+} -+ -+static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) -+{ -+ u32 depth; -+ -+ rcu_read_lock(); -+ depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; -+ rcu_read_unlock(); -+ -+ return depth; -+} -+ -+static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) -+{ -+ size_t idx = U32_MAX - id; -+ size_t new_size; -+ struct snapshot_table *new, *old; -+ -+ new_size = max(16UL, roundup_pow_of_two(idx + 1)); -+ -+ new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); -+ if (!new) -+ return NULL; -+ -+ old = c->snapshots; -+ if (old) -+ memcpy(new->s, -+ rcu_dereference_protected(c->snapshots, true)->s, -+ sizeof(new->s[0]) * c->snapshot_table_size); -+ -+ rcu_assign_pointer(c->snapshots, new); -+ c->snapshot_table_size = new_size; -+ if (old) -+ kvfree_rcu(old); -+ -+ return &rcu_dereference_protected(c->snapshots, true)->s[idx]; -+} -+ -+static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) -+{ -+ size_t idx = U32_MAX - id; -+ -+ lockdep_assert_held(&c->snapshot_table_lock); -+ -+ if (likely(idx < c->snapshot_table_size)) -+ return &rcu_dereference_protected(c->snapshots, true)->s[idx]; -+ -+ return __snapshot_t_mut(c, id); -+} -+ -+/* Snapshot tree: */ -+ -+void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); -+ -+ prt_printf(out, "subvol %u root snapshot %u", -+ le32_to_cpu(t.v->master_subvol), -+ le32_to_cpu(t.v->root_snapshot)); -+} -+ -+int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ if (bkey_gt(k.k->p, POS(0, U32_MAX)) || -+ bkey_lt(k.k->p, POS(0, 1))) { -+ prt_printf(err, "bad pos"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, -+ struct bch_snapshot_tree *s) -+{ -+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), -+ BTREE_ITER_WITH_UPDATES, snapshot_tree, s); -+ -+ if (bch2_err_matches(ret, ENOENT)) -+ ret = -BCH_ERR_ENOENT_snapshot_tree; -+ return ret; -+} -+ -+static struct bkey_i_snapshot_tree * -+__snapshot_tree_create(struct btree_trans *trans) -+{ -+ struct btree_iter iter; -+ int ret = bch2_bkey_get_empty_slot(trans, &iter, -+ BTREE_ID_snapshot_trees, POS(0, U32_MAX)); -+ struct bkey_i_snapshot_tree *s_t; -+ -+ if (ret == -BCH_ERR_ENOSPC_btree_slot) -+ ret = -BCH_ERR_ENOSPC_snapshot_tree; -+ if (ret) -+ return ERR_PTR(ret); -+ -+ s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); -+ ret = PTR_ERR_OR_ZERO(s_t); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret ? 
ERR_PTR(ret) : s_t; -+} -+ -+static int snapshot_tree_create(struct btree_trans *trans, -+ u32 root_id, u32 subvol_id, u32 *tree_id) -+{ -+ struct bkey_i_snapshot_tree *n_tree = -+ __snapshot_tree_create(trans); -+ -+ if (IS_ERR(n_tree)) -+ return PTR_ERR(n_tree); -+ -+ n_tree->v.master_subvol = cpu_to_le32(subvol_id); -+ n_tree->v.root_snapshot = cpu_to_le32(root_id); -+ *tree_id = n_tree->k.p.offset; -+ return 0; -+} -+ -+/* Snapshot nodes: */ -+ -+void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); -+ -+ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", -+ BCH_SNAPSHOT_SUBVOL(s.v), -+ BCH_SNAPSHOT_DELETED(s.v), -+ le32_to_cpu(s.v->parent), -+ le32_to_cpu(s.v->children[0]), -+ le32_to_cpu(s.v->children[1]), -+ le32_to_cpu(s.v->subvol), -+ le32_to_cpu(s.v->tree)); -+ -+ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) -+ prt_printf(out, " depth %u skiplist %u %u %u", -+ le32_to_cpu(s.v->depth), -+ le32_to_cpu(s.v->skip[0]), -+ le32_to_cpu(s.v->skip[1]), -+ le32_to_cpu(s.v->skip[2])); -+} -+ -+int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ struct bkey_s_c_snapshot s; -+ u32 i, id; -+ -+ if (bkey_gt(k.k->p, POS(0, U32_MAX)) || -+ bkey_lt(k.k->p, POS(0, 1))) { -+ prt_printf(err, "bad pos"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ s = bkey_s_c_to_snapshot(k); -+ -+ id = le32_to_cpu(s.v->parent); -+ if (id && id <= k.k->p.offset) { -+ prt_printf(err, "bad parent node (%u <= %llu)", -+ id, k.k->p.offset); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { -+ prt_printf(err, "children not normalized"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (s.v->children[0] && -+ s.v->children[0] == s.v->children[1]) { -+ prt_printf(err, "duplicate child nodes"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ for (i = 0; i < 2; i++) { -+ id = le32_to_cpu(s.v->children[i]); -+ -+ if (id >= k.k->p.offset) { -+ prt_printf(err, "bad child node (%u >= %llu)", -+ id, k.k->p.offset); -+ return -BCH_ERR_invalid_bkey; -+ } -+ } -+ -+ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { -+ if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || -+ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { -+ prt_printf(err, "skiplist not normalized"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { -+ id = le32_to_cpu(s.v->skip[i]); -+ -+ if (!id != !s.v->parent || -+ (s.v->parent && -+ id <= k.k->p.offset)) { -+ prt_printf(err, "bad skiplist node %u)", id); -+ return -BCH_ERR_invalid_bkey; -+ } -+ } -+ } -+ -+ return 0; -+} -+ -+int bch2_mark_snapshot(struct btree_trans *trans, -+ enum btree_id btree, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct snapshot_t *t; -+ u32 id = new.k->p.offset; -+ int ret = 0; -+ -+ mutex_lock(&c->snapshot_table_lock); -+ -+ t = snapshot_t_mut(c, id); -+ if (!t) { -+ ret = -BCH_ERR_ENOMEM_mark_snapshot; -+ goto err; -+ } -+ -+ if (new.k->type == KEY_TYPE_snapshot) { -+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); -+ u32 parent = id; -+ -+ t->parent = le32_to_cpu(s.v->parent); -+ t->children[0] = le32_to_cpu(s.v->children[0]); -+ t->children[1] = le32_to_cpu(s.v->children[1]); -+ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? 
le32_to_cpu(s.v->subvol) : 0; -+ t->tree = le32_to_cpu(s.v->tree); -+ -+ if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { -+ t->depth = le32_to_cpu(s.v->depth); -+ t->skip[0] = le32_to_cpu(s.v->skip[0]); -+ t->skip[1] = le32_to_cpu(s.v->skip[1]); -+ t->skip[2] = le32_to_cpu(s.v->skip[2]); -+ } else { -+ t->depth = 0; -+ t->skip[0] = 0; -+ t->skip[1] = 0; -+ t->skip[2] = 0; -+ } -+ -+ while ((parent = bch2_snapshot_parent_early(c, parent)) && -+ parent - id - 1 < IS_ANCESTOR_BITMAP) -+ __set_bit(parent - id - 1, t->is_ancestor); -+ -+ if (BCH_SNAPSHOT_DELETED(s.v)) { -+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots); -+ } -+ } else { -+ memset(t, 0, sizeof(*t)); -+ } -+err: -+ mutex_unlock(&c->snapshot_table_lock); -+ return ret; -+} -+ -+static int snapshot_lookup(struct btree_trans *trans, u32 id, -+ struct bch_snapshot *s) -+{ -+ return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), -+ BTREE_ITER_WITH_UPDATES, snapshot, s); -+} -+ -+static int snapshot_live(struct btree_trans *trans, u32 id) -+{ -+ struct bch_snapshot v; -+ int ret; -+ -+ if (!id) -+ return 0; -+ -+ ret = snapshot_lookup(trans, id, &v); -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(trans->c, "snapshot node %u not found", id); -+ if (ret) -+ return ret; -+ -+ return !BCH_SNAPSHOT_DELETED(&v); -+} -+ -+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned i, nr_live = 0, live_idx = 0; -+ struct bkey_s_c_snapshot snap; -+ u32 id = k.k->p.offset, child[2]; -+ -+ if (k.k->type != KEY_TYPE_snapshot) -+ return 0; -+ -+ snap = bkey_s_c_to_snapshot(k); -+ -+ child[0] = le32_to_cpu(snap.v->children[0]); -+ child[1] = le32_to_cpu(snap.v->children[1]); -+ -+ for (i = 0; i < 2; i++) { -+ int ret = snapshot_live(trans, child[i]); -+ -+ if (ret < 0) -+ return ret; -+ -+ if (ret) -+ live_idx = i; -+ nr_live += ret; -+ } -+ -+ mutex_lock(&c->snapshot_table_lock); -+ -+ snapshot_t_mut(c, id)->equiv = nr_live == 1 -+ ? 
snapshot_t_mut(c, child[live_idx])->equiv -+ : id; -+ -+ mutex_unlock(&c->snapshot_table_lock); -+ -+ return 0; -+} -+ -+/* fsck: */ -+ -+static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) -+{ -+ return snapshot_t(c, id)->children[child]; -+} -+ -+static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) -+{ -+ return bch2_snapshot_child(c, id, 0); -+} -+ -+static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) -+{ -+ return bch2_snapshot_child(c, id, 1); -+} -+ -+static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) -+{ -+ u32 n, parent; -+ -+ n = bch2_snapshot_left_child(c, id); -+ if (n) -+ return n; -+ -+ while ((parent = bch2_snapshot_parent(c, id))) { -+ n = bch2_snapshot_right_child(c, parent); -+ if (n && n != id) -+ return n; -+ id = parent; -+ } -+ -+ return 0; -+} -+ -+static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) -+{ -+ u32 id = snapshot_root; -+ u32 subvol = 0, s; -+ -+ while (id) { -+ s = snapshot_t(c, id)->subvol; -+ -+ if (s && (!subvol || s < subvol)) -+ subvol = s; -+ -+ id = bch2_snapshot_tree_next(c, id); -+ } -+ -+ return subvol; -+} -+ -+static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, -+ u32 snapshot_root, u32 *subvol_id) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_subvolume s; -+ bool found = false; -+ int ret; -+ -+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, -+ 0, k, ret) { -+ if (k.k->type != KEY_TYPE_subvolume) -+ continue; -+ -+ s = bkey_s_c_to_subvolume(k); -+ if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) -+ continue; -+ if (!BCH_SUBVOLUME_SNAP(s.v)) { -+ *subvol_id = s.k->p.offset; -+ found = true; -+ break; -+ } -+ } -+ -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (!ret && !found) { -+ struct bkey_i_subvolume *s; -+ -+ *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); -+ -+ s = bch2_bkey_get_mut_typed(trans, &iter, -+ BTREE_ID_subvolumes, POS(0, *subvol_id), -+ 0, subvolume); -+ ret = PTR_ERR_OR_ZERO(s); -+ if (ret) -+ return ret; -+ -+ SET_BCH_SUBVOLUME_SNAP(&s->v, false); -+ } -+ -+ return ret; -+} -+ -+static int check_snapshot_tree(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c_snapshot_tree st; -+ struct bch_snapshot s; -+ struct bch_subvolume subvol; -+ struct printbuf buf = PRINTBUF; -+ u32 root_id; -+ int ret; -+ -+ if (k.k->type != KEY_TYPE_snapshot_tree) -+ return 0; -+ -+ st = bkey_s_c_to_snapshot_tree(k); -+ root_id = le32_to_cpu(st.v->root_snapshot); -+ -+ ret = snapshot_lookup(trans, root_id, &s); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ goto err; -+ -+ if (fsck_err_on(ret || -+ root_id != bch2_snapshot_root(c, root_id) || -+ st.k->p.offset != le32_to_cpu(s.tree), -+ c, -+ "snapshot tree points to missing/incorrect snapshot:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { -+ ret = bch2_btree_delete_at(trans, iter, 0); -+ goto err; -+ } -+ -+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), -+ false, 0, &subvol); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ goto err; -+ -+ if (fsck_err_on(ret, c, -+ "snapshot tree points to missing subvolume:\n %s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || -+ fsck_err_on(!bch2_snapshot_is_ancestor_early(c, -+ le32_to_cpu(subvol.snapshot), -+ root_id), c, -+ "snapshot tree points to subvolume that does not point 
to snapshot in this tree:\n %s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || -+ fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c, -+ "snapshot tree points to snapshot subvolume:\n %s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { -+ struct bkey_i_snapshot_tree *u; -+ u32 subvol_id; -+ -+ ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); -+ if (ret) -+ goto err; -+ -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ u->v.master_subvol = cpu_to_le32(subvol_id); -+ st = snapshot_tree_i_to_s_c(u); -+ } -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+/* -+ * For each snapshot_tree, make sure it points to the root of a snapshot tree -+ * and that snapshot entry points back to it, or delete it. -+ * -+ * And, make sure it points to a subvolume within that snapshot tree, or correct -+ * it to point to the oldest subvolume within that snapshot tree. -+ */ -+int bch2_check_snapshot_trees(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ ret = bch2_trans_run(c, -+ for_each_btree_key_commit(&trans, iter, -+ BTREE_ID_snapshot_trees, POS_MIN, -+ BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, -+ check_snapshot_tree(&trans, &iter, k))); -+ -+ if (ret) -+ bch_err(c, "error %i checking snapshot trees", ret); -+ return ret; -+} -+ -+/* -+ * Look up snapshot tree for @tree_id and find root, -+ * make sure @snap_id is a descendent: -+ */ -+static int snapshot_tree_ptr_good(struct btree_trans *trans, -+ u32 snap_id, u32 tree_id) -+{ -+ struct bch_snapshot_tree s_t; -+ int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); -+ -+ if (bch2_err_matches(ret, ENOENT)) -+ return 0; -+ if (ret) -+ return ret; -+ -+ return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); -+} -+ -+static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id) -+{ -+ const struct snapshot_t *s; -+ -+ if (!id) -+ return 0; -+ -+ rcu_read_lock(); -+ s = snapshot_t(c, id); -+ if (s->parent) -+ id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) -+{ -+ struct bch_snapshot a; -+ unsigned i; -+ int ret; -+ -+ for (i = 0; i < 3; i++) { -+ if (!s.parent != !s.skip[i]) -+ return false; -+ -+ if (!s.parent) -+ continue; -+ -+ ret = snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a); -+ if (bch2_err_matches(ret, ENOENT)) -+ return false; -+ if (ret) -+ return ret; -+ -+ if (a.tree != s.tree) -+ return false; -+ } -+ -+ return true; -+} -+ -+/* -+ * snapshot_tree pointer was incorrect: look up root snapshot node, make sure -+ * its snapshot_tree pointer is correct (allocate new one if necessary), then -+ * update this node's pointer to root node's pointer: -+ */ -+static int snapshot_tree_ptr_repair(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ struct bch_snapshot *s) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter root_iter; -+ struct bch_snapshot_tree s_t; -+ struct bkey_s_c_snapshot root; -+ struct bkey_i_snapshot *u; -+ u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; -+ int ret; -+ -+ root = bch2_bkey_get_iter_typed(trans, &root_iter, -+ BTREE_ID_snapshots, POS(0, root_id), -+ BTREE_ITER_WITH_UPDATES, snapshot); -+ ret = bkey_err(root); -+ if (ret) -+ goto 
err; -+ -+ tree_id = le32_to_cpu(root.v->tree); -+ -+ ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ return ret; -+ -+ if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { -+ u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u) ?: -+ snapshot_tree_create(trans, root_id, -+ bch2_snapshot_tree_oldest_subvol(c, root_id), -+ &tree_id); -+ if (ret) -+ goto err; -+ -+ u->v.tree = cpu_to_le32(tree_id); -+ if (k.k->p.offset == root_id) -+ *s = u->v; -+ } -+ -+ if (k.k->p.offset != root_id) { -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ u->v.tree = cpu_to_le32(tree_id); -+ *s = u->v; -+ } -+err: -+ bch2_trans_iter_exit(trans, &root_iter); -+ return ret; -+} -+ -+static int check_snapshot(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_snapshot s; -+ struct bch_subvolume subvol; -+ struct bch_snapshot v; -+ struct bkey_i_snapshot *u; -+ u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); -+ u32 real_depth; -+ struct printbuf buf = PRINTBUF; -+ bool should_have_subvol; -+ u32 i, id; -+ int ret = 0; -+ -+ if (k.k->type != KEY_TYPE_snapshot) -+ return 0; -+ -+ memset(&s, 0, sizeof(s)); -+ memcpy(&s, k.v, bkey_val_bytes(k.k)); -+ -+ id = le32_to_cpu(s.parent); -+ if (id) { -+ ret = snapshot_lookup(trans, id, &v); -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(c, "snapshot with nonexistent parent:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ if (ret) -+ goto err; -+ -+ if (le32_to_cpu(v.children[0]) != k.k->p.offset && -+ le32_to_cpu(v.children[1]) != k.k->p.offset) { -+ bch_err(c, "snapshot parent %u missing pointer to child %llu", -+ id, k.k->p.offset); -+ ret = -EINVAL; -+ goto err; -+ } -+ } -+ -+ for (i = 0; i < 2 && s.children[i]; i++) { -+ id = le32_to_cpu(s.children[i]); -+ -+ ret = snapshot_lookup(trans, id, &v); -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(c, "snapshot node %llu has nonexistent child %u", -+ k.k->p.offset, id); -+ if (ret) -+ goto err; -+ -+ if (le32_to_cpu(v.parent) != k.k->p.offset) { -+ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", -+ id, le32_to_cpu(v.parent), k.k->p.offset); -+ ret = -EINVAL; -+ goto err; -+ } -+ } -+ -+ should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && -+ !BCH_SNAPSHOT_DELETED(&s); -+ -+ if (should_have_subvol) { -+ id = le32_to_cpu(s.subvol); -+ ret = bch2_subvolume_get(trans, id, 0, false, &subvol); -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(c, "snapshot points to nonexistent subvolume:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ if (ret) -+ goto err; -+ -+ if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { -+ bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", -+ k.k->p.offset); -+ ret = -EINVAL; -+ goto err; -+ } -+ } else { -+ if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ u->v.subvol = 0; -+ s = u->v; -+ } -+ } -+ -+ ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); -+ if (ret < 0) -+ goto err; -+ -+ if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ ret = 
snapshot_tree_ptr_repair(trans, iter, k, &s); -+ if (ret) -+ goto err; -+ } -+ ret = 0; -+ -+ real_depth = bch2_snapshot_depth(c, parent_id); -+ -+ if (le32_to_cpu(s.depth) != real_depth && -+ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || -+ fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s", -+ real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ u->v.depth = cpu_to_le32(real_depth); -+ s = u->v; -+ } -+ -+ ret = snapshot_skiplist_good(trans, s); -+ if (ret < 0) -+ goto err; -+ -+ if (!ret && -+ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || -+ fsck_err(c, "snapshot with bad skiplist field:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) -+ u->v.skip[i] = cpu_to_le32(snapshot_skiplist_get(c, parent_id)); -+ -+ bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_int); -+ s = u->v; -+ } -+ ret = 0; -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+int bch2_check_snapshots(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ /* -+ * We iterate backwards as checking/fixing the depth field requires that -+ * the parent's depth already be correct: -+ */ -+ ret = bch2_trans_run(c, -+ for_each_btree_key_reverse_commit(&trans, iter, -+ BTREE_ID_snapshots, POS_MAX, -+ BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, -+ check_snapshot(&trans, &iter, k))); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ +static int check_subvol(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) @@ -81694,7 +85254,7 @@ index 000000000..811a6f428 + + subvol = bkey_s_c_to_subvolume(k); + snapid = le32_to_cpu(subvol.v->snapshot); -+ ret = snapshot_lookup(trans, snapid, &snapshot); ++ ret = bch2_snapshot_lookup(trans, snapid, &snapshot); + + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "subvolume %llu points to nonexistent snapshot %u", @@ -81762,462 +85322,6 @@ index 000000000..811a6f428 + return ret; +} + -+void bch2_fs_snapshots_exit(struct bch_fs *c) -+{ -+ kfree(c->snapshots); -+} -+ -+int bch2_snapshots_read(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ ret = bch2_trans_run(c, -+ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, -+ bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: -+ bch2_snapshot_set_equiv(&trans, k))); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+/* -+ * Mark a snapshot as deleted, for future cleanup: -+ */ -+static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) -+{ -+ struct btree_iter iter; -+ struct bkey_i_snapshot *s; -+ int ret = 0; -+ -+ s = bch2_bkey_get_mut_typed(trans, &iter, -+ BTREE_ID_snapshots, POS(0, id), -+ 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(s); -+ if (unlikely(ret)) { -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), -+ trans->c, "missing snapshot %u", id); -+ return ret; -+ } -+ -+ /* already deleted? 
*/ -+ if (BCH_SNAPSHOT_DELETED(&s->v)) -+ goto err; -+ -+ SET_BCH_SNAPSHOT_DELETED(&s->v, true); -+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); -+ s->v.subvol = 0; -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; -+ struct btree_iter tree_iter = (struct btree_iter) { NULL }; -+ struct bkey_s_c_snapshot s; -+ u32 parent_id; -+ unsigned i; -+ int ret = 0; -+ -+ s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), -+ BTREE_ITER_INTENT, snapshot); -+ ret = bkey_err(s); -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, -+ "missing snapshot %u", id); -+ -+ if (ret) -+ goto err; -+ -+ BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); -+ parent_id = le32_to_cpu(s.v->parent); -+ -+ if (parent_id) { -+ struct bkey_i_snapshot *parent; -+ -+ parent = bch2_bkey_get_mut_typed(trans, &p_iter, -+ BTREE_ID_snapshots, POS(0, parent_id), -+ 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(parent); -+ if (unlikely(ret)) { -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, -+ "missing snapshot %u", parent_id); -+ goto err; -+ } -+ -+ for (i = 0; i < 2; i++) -+ if (le32_to_cpu(parent->v.children[i]) == id) -+ break; -+ -+ if (i == 2) -+ bch_err(c, "snapshot %u missing child pointer to %u", -+ parent_id, id); -+ else -+ parent->v.children[i] = 0; -+ -+ if (le32_to_cpu(parent->v.children[0]) < -+ le32_to_cpu(parent->v.children[1])) -+ swap(parent->v.children[0], -+ parent->v.children[1]); -+ } else { -+ /* -+ * We're deleting the root of a snapshot tree: update the -+ * snapshot_tree entry to point to the new root, or delete it if -+ * this is the last snapshot ID in this tree: -+ */ -+ struct bkey_i_snapshot_tree *s_t; -+ -+ BUG_ON(s.v->children[1]); -+ -+ s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, -+ BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), -+ 0, snapshot_tree); -+ ret = PTR_ERR_OR_ZERO(s_t); -+ if (ret) -+ goto err; -+ -+ if (s.v->children[0]) { -+ s_t->v.root_snapshot = s.v->children[0]; -+ } else { -+ s_t->k.type = KEY_TYPE_deleted; -+ set_bkey_val_u64s(&s_t->k, 0); -+ } -+ } -+ -+ ret = bch2_btree_delete_at(trans, &iter, 0); -+err: -+ bch2_trans_iter_exit(trans, &tree_iter); -+ bch2_trans_iter_exit(trans, &p_iter); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_i_snapshot *n; -+ struct bkey_s_c k; -+ unsigned i, j; -+ u32 depth = bch2_snapshot_depth(c, parent); -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, -+ POS_MIN, BTREE_ITER_INTENT); -+ k = bch2_btree_iter_peek(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ for (i = 0; i < nr_snapids; i++) { -+ k = bch2_btree_iter_prev_slot(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (!k.k || !k.k->p.offset) { -+ ret = -BCH_ERR_ENOSPC_snapshot_create; -+ goto err; -+ } -+ -+ n = bch2_bkey_alloc(trans, &iter, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(n); -+ if (ret) -+ goto err; -+ -+ n->v.flags = 0; -+ n->v.parent = cpu_to_le32(parent); -+ n->v.subvol = cpu_to_le32(snapshot_subvols[i]); -+ n->v.tree = cpu_to_le32(tree); -+ n->v.depth = cpu_to_le32(depth); -+ -+ for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) -+ n->v.skip[j] = 
cpu_to_le32(snapshot_skiplist_get(c, parent)); -+ -+ bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_int); -+ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); -+ -+ ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, -+ bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); -+ if (ret) -+ goto err; -+ -+ new_snapids[i] = iter.pos.offset; -+ } -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+/* -+ * Create new snapshot IDs as children of an existing snapshot ID: -+ */ -+static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) -+{ -+ struct btree_iter iter; -+ struct bkey_i_snapshot *n_parent; -+ int ret = 0; -+ -+ n_parent = bch2_bkey_get_mut_typed(trans, &iter, -+ BTREE_ID_snapshots, POS(0, parent), -+ 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(n_parent); -+ if (unlikely(ret)) { -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(trans->c, "snapshot %u not found", parent); -+ return ret; -+ } -+ -+ if (n_parent->v.children[0] || n_parent->v.children[1]) { -+ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), -+ new_snapids, snapshot_subvols, nr_snapids); -+ if (ret) -+ goto err; -+ -+ n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); -+ n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); -+ n_parent->v.subvol = 0; -+ SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+/* -+ * Create a snapshot node that is the root of a new tree: -+ */ -+static int bch2_snapshot_node_create_tree(struct btree_trans *trans, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) -+{ -+ struct bkey_i_snapshot_tree *n_tree; -+ int ret; -+ -+ n_tree = __snapshot_tree_create(trans); -+ ret = PTR_ERR_OR_ZERO(n_tree) ?: -+ create_snapids(trans, 0, n_tree->k.p.offset, -+ new_snapids, snapshot_subvols, nr_snapids); -+ if (ret) -+ return ret; -+ -+ n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); -+ n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); -+ return 0; -+} -+ -+int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) -+{ -+ BUG_ON((parent == 0) != (nr_snapids == 1)); -+ BUG_ON((parent != 0) != (nr_snapids == 2)); -+ -+ return parent -+ ? 
bch2_snapshot_node_create_children(trans, parent, -+ new_snapids, snapshot_subvols, nr_snapids) -+ : bch2_snapshot_node_create_tree(trans, -+ new_snapids, snapshot_subvols, nr_snapids); -+ -+} -+ -+static int snapshot_delete_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ snapshot_id_list *deleted, -+ snapshot_id_list *equiv_seen, -+ struct bpos *last_pos) -+{ -+ struct bch_fs *c = trans->c; -+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); -+ -+ if (!bkey_eq(k.k->p, *last_pos)) -+ equiv_seen->nr = 0; -+ *last_pos = k.k->p; -+ -+ if (snapshot_list_has_id(deleted, k.k->p.snapshot) || -+ snapshot_list_has_id(equiv_seen, equiv)) { -+ return bch2_btree_delete_at(trans, iter, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ } else { -+ return snapshot_list_add(c, equiv_seen, equiv); -+ } -+} -+ -+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_snapshot snap; -+ u32 children[2]; -+ int ret; -+ -+ if (k.k->type != KEY_TYPE_snapshot) -+ return 0; -+ -+ snap = bkey_s_c_to_snapshot(k); -+ if (BCH_SNAPSHOT_DELETED(snap.v) || -+ BCH_SNAPSHOT_SUBVOL(snap.v)) -+ return 0; -+ -+ children[0] = le32_to_cpu(snap.v->children[0]); -+ children[1] = le32_to_cpu(snap.v->children[1]); -+ -+ ret = snapshot_live(trans, children[0]) ?: -+ snapshot_live(trans, children[1]); -+ if (ret < 0) -+ return ret; -+ -+ if (!ret) -+ return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); -+ return 0; -+} -+ -+int bch2_delete_dead_snapshots(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_snapshot snap; -+ snapshot_id_list deleted = { 0 }; -+ u32 i, id; -+ int ret = 0; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) { -+ ret = bch2_fs_read_write_early(c); -+ if (ret) { -+ bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); -+ return ret; -+ } -+ } -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ /* -+ * For every snapshot node: If we have no live children and it's not -+ * pointed to by a subvolume, delete it: -+ */ -+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, -+ NULL, NULL, 0, -+ bch2_delete_redundant_snapshot(&trans, &iter, k)); -+ if (ret) { -+ bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, -+ bch2_snapshot_set_equiv(&trans, k)); -+ if (ret) { -+ bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_snapshot) -+ continue; -+ -+ snap = bkey_s_c_to_snapshot(k); -+ if (BCH_SNAPSHOT_DELETED(snap.v)) { -+ ret = snapshot_list_add(c, &deleted, k.k->p.offset); -+ if (ret) -+ break; -+ } -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret) { -+ bch_err(c, "error walking snapshots: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ struct bpos last_pos = POS_MIN; -+ snapshot_id_list equiv_seen = { 0 }; -+ -+ if (!btree_type_has_snapshots(id)) -+ continue; -+ -+ ret = for_each_btree_key_commit(&trans, iter, -+ id, POS_MIN, -+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, -+ NULL, NULL, BTREE_INSERT_NOFAIL, -+ snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); -+ -+ darray_exit(&equiv_seen); -+ -+ if (ret) { -+ bch_err(c, "error 
deleting snapshot keys: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ } -+ -+ for (i = 0; i < deleted.nr; i++) { -+ ret = commit_do(&trans, NULL, NULL, 0, -+ bch2_snapshot_node_delete(&trans, deleted.data[i])); -+ if (ret) { -+ bch_err(c, "error deleting snapshot %u: %s", -+ deleted.data[i], bch2_err_str(ret)); -+ goto err; -+ } -+ } -+ -+ clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -+err: -+ darray_exit(&deleted); -+ bch2_trans_exit(&trans); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static void bch2_delete_dead_snapshots_work(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); -+ -+ if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) -+ bch2_delete_dead_snapshots(c); -+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -+} -+ -+void bch2_delete_dead_snapshots_async(struct bch_fs *c) -+{ -+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && -+ !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) -+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -+} -+ -+static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, -+ struct btree_trans_commit_hook *h) -+{ -+ struct bch_fs *c = trans->c; -+ -+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -+ -+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) -+ return 0; -+ -+ bch2_delete_dead_snapshots_async(c); -+ return 0; -+} -+ +/* Subvolumes: */ + +int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, @@ -82272,26 +85376,27 @@ index 000000000..811a6f428 +{ + struct bch_snapshot snap; + -+ return snapshot_lookup(trans, snapshot, &snap) ?: ++ return bch2_snapshot_lookup(trans, snapshot, &snap) ?: + bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); +} + -+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, ++int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, + u32 *snapid) +{ + struct btree_iter iter; -+ struct bkey_s_c k; ++ struct bkey_s_c_subvolume subvol; + int ret; + -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), -+ BTREE_ITER_CACHED| -+ BTREE_ITER_WITH_UPDATES); -+ ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -BCH_ERR_ENOENT_subvolume; ++ subvol = bch2_bkey_get_iter_typed(trans, &iter, ++ BTREE_ID_subvolumes, POS(0, subvolid), ++ BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES, ++ subvolume); ++ ret = bkey_err(subvol); ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, ++ "missing subvolume %u", subvolid); + + if (likely(!ret)) -+ *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); -+ else if (bch2_err_matches(ret, ENOENT)) -+ bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); ++ *snapid = le32_to_cpu(subvol.v->snapshot); + bch2_trans_iter_exit(trans, &iter); + return ret; +} @@ -82321,7 +85426,12 @@ index 000000000..811a6f428 +} + +/* -+ * Scan for subvolumes with parent @subvolid_to_delete, reparent: ++ * Separate from the snapshot tree in the snapshots btree, we record the tree ++ * structure of how snapshot subvolumes were created - the parent subvolume of ++ * each snapshot subvolume. 
++ * ++ * When a subvolume is deleted, we scan for child subvolumes and reparant them, ++ * to avoid dangling references: + */ +static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete) +{ @@ -82567,10 +85677,10 @@ index 000000000..811a6f428 +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 -index 000000000..6905e91a9 +index 000000000..8d4c50f4c --- /dev/null +++ b/fs/bcachefs/subvolume.h -@@ -0,0 +1,258 @@ +@@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_H +#define _BCACHEFS_SUBVOLUME_H @@ -82580,225 +85690,8 @@ index 000000000..6905e91a9 + +enum bkey_invalid_flags; + -+void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+ -+#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ -+ .key_invalid = bch2_snapshot_tree_invalid, \ -+ .val_to_text = bch2_snapshot_tree_to_text, \ -+ .min_val_size = 8, \ -+}) -+ -+int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); -+ -+void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_s_c, unsigned); -+ -+#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ -+ .key_invalid = bch2_snapshot_invalid, \ -+ .val_to_text = bch2_snapshot_to_text, \ -+ .atomic_trigger = bch2_mark_snapshot, \ -+ .min_val_size = 24, \ -+}) -+ -+static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) -+{ -+ return &t->s[U32_MAX - id]; -+} -+ -+static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) -+{ -+ return __snapshot_t(rcu_dereference(c->snapshots), id); -+} -+ -+static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) -+{ -+ rcu_read_lock(); -+ id = snapshot_t(c, id)->tree; -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -+{ -+ return snapshot_t(c, id)->parent; -+} -+ -+static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -+{ -+ rcu_read_lock(); -+ id = __bch2_snapshot_parent_early(c, id); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ u32 parent = snapshot_t(c, id)->parent; -+ -+ if (parent && -+ snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) -+ panic("id %u depth=%u parent %u depth=%u\n", -+ id, snapshot_t(c, id)->depth, -+ parent, snapshot_t(c, parent)->depth); -+ -+ return parent; -+#else -+ return snapshot_t(c, id)->parent; -+#endif -+} -+ -+static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) -+{ -+ rcu_read_lock(); -+ id = __bch2_snapshot_parent(c, id); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) -+{ -+ rcu_read_lock(); -+ while (n--) -+ id = __bch2_snapshot_parent(c, id); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) -+{ -+ u32 parent; -+ -+ rcu_read_lock(); -+ while ((parent = __bch2_snapshot_parent(c, id))) -+ id = parent; -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 __bch2_snapshot_equiv(struct bch_fs 
*c, u32 id) -+{ -+ return snapshot_t(c, id)->equiv; -+} -+ -+static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) -+{ -+ rcu_read_lock(); -+ id = __bch2_snapshot_equiv(c, id); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) -+{ -+ return id == bch2_snapshot_equiv(c, id); -+} -+ -+static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) -+{ -+ const struct snapshot_t *s; -+ bool ret; -+ -+ rcu_read_lock(); -+ s = snapshot_t(c, id); -+ ret = s->children[0]; -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) -+{ -+ return !bch2_snapshot_is_internal_node(c, id); -+} -+ -+static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) -+{ -+ const struct snapshot_t *s; -+ u32 parent = __bch2_snapshot_parent(c, id); -+ -+ if (!parent) -+ return 0; -+ -+ s = snapshot_t(c, __bch2_snapshot_parent(c, id)); -+ if (id == s->children[0]) -+ return s->children[1]; -+ if (id == s->children[1]) -+ return s->children[0]; -+ return 0; -+} -+ -+bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); -+ -+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -+{ -+ return id == ancestor -+ ? true -+ : __bch2_snapshot_is_ancestor(c, id, ancestor); -+} -+ -+static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) -+{ -+ const struct snapshot_t *t; -+ bool ret; -+ -+ rcu_read_lock(); -+ t = snapshot_t(c, id); -+ ret = (t->children[0]|t->children[1]) != 0; -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) -+{ -+ u32 *i; -+ -+ darray_for_each(*s, i) -+ if (*i == id) -+ return true; -+ return false; -+} -+ -+static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) -+{ -+ u32 *i; -+ -+ darray_for_each(*s, i) -+ if (bch2_snapshot_is_ancestor(c, id, *i)) -+ return true; -+ return false; -+} -+ -+static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) -+{ -+ int ret; -+ -+ BUG_ON(snapshot_list_has_id(s, id)); -+ ret = darray_push(s, id); -+ if (ret) -+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); -+ return ret; -+} -+ -+int bch2_check_snapshot_trees(struct bch_fs *); -+int bch2_check_snapshots(struct bch_fs *); +int bch2_check_subvols(struct bch_fs *); + -+void bch2_fs_snapshots_exit(struct bch_fs *); -+int bch2_snapshots_read(struct bch_fs *); -+ +int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, + unsigned, struct printbuf *); +void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -82811,14 +85704,8 @@ index 000000000..6905e91a9 + +int bch2_subvolume_get(struct btree_trans *, unsigned, + bool, int, struct bch_subvolume *); -+int bch2_snapshot_get_subvol(struct btree_trans *, u32, -+ struct bch_subvolume *); +int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + -+/* only exported for tests: */ -+int bch2_snapshot_node_create(struct btree_trans *, u32, -+ u32 *, u32 *, unsigned); -+ +int bch2_delete_dead_snapshots(struct bch_fs *); +void bch2_delete_dead_snapshots_async(struct bch_fs *); + @@ -82868,15 +85755,13 @@ index 000000000..86833445a +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c new file mode 100644 -index 000000000..d2d3eba4d +index 000000000..f01883e78 --- /dev/null +++ b/fs/bcachefs/super-io.c -@@ -0,0 +1,1714 @@ +@@ -0,0 
+1,1265 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" -+#include "btree_update_interior.h" -+#include "buckets.h" +#include "checksum.h" +#include "counters.h" +#include "disk_groups.h" @@ -82884,12 +85769,13 @@ index 000000000..d2d3eba4d +#include "error.h" +#include "io.h" +#include "journal.h" -+#include "journal_io.h" +#include "journal_sb.h" +#include "journal_seq_blacklist.h" +#include "recovery.h" +#include "replicas.h" +#include "quota.h" ++#include "sb-clean.h" ++#include "sb-members.h" +#include "super-io.h" +#include "super.h" +#include "trace.h" @@ -82898,6 +85784,9 @@ index 000000000..d2d3eba4d +#include +#include + ++static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { ++}; ++ +struct bch2_metadata_version { + u16 version; + const char *name; @@ -83036,7 +85925,8 @@ index 000000000..d2d3eba4d +{ + kfree(sb->bio); + if (!IS_ERR_OR_NULL(sb->bdev)) -+ blkdev_put(sb->bdev, sb->mode); ++ blkdev_put(sb->bdev, sb->holder); ++ kfree(sb->holder); + + kfree(sb->sb); + memset(sb, 0, sizeof(*sb)); @@ -83073,8 +85963,14 @@ index 000000000..d2d3eba4d + if (dynamic_fault("bcachefs:add:super_realloc")) + return -BCH_ERR_ENOMEM_sb_realloc_injected; + ++ new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); ++ if (!new_sb) ++ return -BCH_ERR_ENOMEM_sb_buf_realloc; ++ ++ sb->sb = new_sb; ++ + if (sb->have_bio) { -+ unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE); ++ unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size); + + bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!bio) @@ -83086,11 +85982,6 @@ index 000000000..d2d3eba4d + sb->bio = bio; + } + -+ new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); -+ if (!new_sb) -+ return -BCH_ERR_ENOMEM_sb_buf_realloc; -+ -+ sb->sb = new_sb; + sb->buffer_size = new_buffer_size; + + return 0; @@ -83135,16 +86026,13 @@ index 000000000..d2d3eba4d + +/* Superblock validate: */ + -+static inline void __bch2_sb_layout_size_assert(void) -+{ -+ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); -+} -+ +static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) +{ + u64 offset, prev_offset, max_sectors; + unsigned i; + ++ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); ++ + if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && + !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { + prt_printf(out, "Not a bcachefs superblock layout"); @@ -83425,7 +86313,9 @@ index 000000000..d2d3eba4d + d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - + (dst_f ? 
le32_to_cpu(dst_f->u64s) : 0); + if (d > 0) { -+ int ret = bch2_sb_realloc(dst_handle, le32_to_cpu(dst_handle->sb->u64s) + d); ++ int ret = bch2_sb_realloc(dst_handle, ++ le32_to_cpu(dst_handle->sb->u64s) + d); ++ + if (ret) + return ret; + @@ -83539,8 +86429,11 @@ index 000000000..d2d3eba4d +retry: +#endif + memset(sb, 0, sizeof(*sb)); -+ sb->mode = FMODE_READ; ++ sb->mode = BLK_OPEN_READ; + sb->have_bio = true; ++ sb->holder = kmalloc(1, GFP_KERNEL); ++ if (!sb->holder) ++ return -ENOMEM; + +#ifndef __KERNEL__ + if (opt_get(*opts, direct_io) == false) @@ -83548,18 +86441,18 @@ index 000000000..d2d3eba4d +#endif + + if (!opt_get(*opts, noexcl)) -+ sb->mode |= FMODE_EXCL; ++ sb->mode |= BLK_OPEN_EXCL; + + if (!opt_get(*opts, nochanges)) -+ sb->mode |= FMODE_WRITE; ++ sb->mode |= BLK_OPEN_WRITE; + -+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (IS_ERR(sb->bdev) && + PTR_ERR(sb->bdev) == -EACCES && + opt_get(*opts, read_only)) { -+ sb->mode &= ~FMODE_WRITE; ++ sb->mode &= ~BLK_OPEN_WRITE; + -+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); + if (!IS_ERR(sb->bdev)) + opt_set(*opts, nochanges, true); + } @@ -83882,235 +86775,6 @@ index 000000000..d2d3eba4d + mutex_unlock(&c->sb_lock); +} + -+/* BCH_SB_FIELD_members: */ -+ -+static int bch2_sb_members_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_members *mi = field_to_type(f, members); -+ unsigned i; -+ -+ if ((void *) (mi->members + sb->nr_devices) > -+ vstruct_end(&mi->field)) { -+ prt_printf(err, "too many devices for section size"); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ -+ for (i = 0; i < sb->nr_devices; i++) { -+ struct bch_member *m = mi->members + i; -+ -+ if (!bch2_member_exists(m)) -+ continue; -+ -+ if (le64_to_cpu(m->nbuckets) > LONG_MAX) { -+ prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", -+ i, le64_to_cpu(m->nbuckets), LONG_MAX); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ -+ if (le64_to_cpu(m->nbuckets) - -+ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { -+ prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", -+ i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ -+ if (le16_to_cpu(m->bucket_size) < -+ le16_to_cpu(sb->block_size)) { -+ prt_printf(err, "device %u: bucket size %u smaller than block size %u", -+ i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ -+ if (le16_to_cpu(m->bucket_size) < -+ BCH_SB_BTREE_NODE_SIZE(sb)) { -+ prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", -+ i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ } -+ -+ return 0; -+} -+ -+static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_members *mi = field_to_type(f, members); -+ struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); -+ unsigned i; -+ -+ for (i = 0; i < sb->nr_devices; i++) { -+ struct bch_member *m = mi->members + i; -+ unsigned data_have = bch2_sb_dev_has_data(sb, i); -+ u64 bucket_size = le16_to_cpu(m->bucket_size); -+ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; -+ -+ if (!bch2_member_exists(m)) -+ continue; -+ -+ prt_printf(out, "Device:"); -+ 
prt_tab(out); -+ prt_printf(out, "%u", i); -+ prt_newline(out); -+ -+ printbuf_indent_add(out, 2); -+ -+ prt_printf(out, "UUID:"); -+ prt_tab(out); -+ pr_uuid(out, m->uuid.b); -+ prt_newline(out); -+ -+ prt_printf(out, "Size:"); -+ prt_tab(out); -+ prt_units_u64(out, device_size << 9); -+ prt_newline(out); -+ -+ prt_printf(out, "Bucket size:"); -+ prt_tab(out); -+ prt_units_u64(out, bucket_size << 9); -+ prt_newline(out); -+ -+ prt_printf(out, "First bucket:"); -+ prt_tab(out); -+ prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); -+ prt_newline(out); -+ -+ prt_printf(out, "Buckets:"); -+ prt_tab(out); -+ prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); -+ prt_newline(out); -+ -+ prt_printf(out, "Last mount:"); -+ prt_tab(out); -+ if (m->last_mount) -+ pr_time(out, le64_to_cpu(m->last_mount)); -+ else -+ prt_printf(out, "(never)"); -+ prt_newline(out); -+ -+ prt_printf(out, "State:"); -+ prt_tab(out); -+ prt_printf(out, "%s", -+ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR -+ ? bch2_member_states[BCH_MEMBER_STATE(m)] -+ : "unknown"); -+ prt_newline(out); -+ -+ prt_printf(out, "Label:"); -+ prt_tab(out); -+ if (BCH_MEMBER_GROUP(m)) { -+ unsigned idx = BCH_MEMBER_GROUP(m) - 1; -+ -+ if (idx < disk_groups_nr(gi)) -+ prt_printf(out, "%s (%u)", -+ gi->entries[idx].label, idx); -+ else -+ prt_printf(out, "(bad disk labels section)"); -+ } else { -+ prt_printf(out, "(none)"); -+ } -+ prt_newline(out); -+ -+ prt_printf(out, "Data allowed:"); -+ prt_tab(out); -+ if (BCH_MEMBER_DATA_ALLOWED(m)) -+ prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); -+ else -+ prt_printf(out, "(none)"); -+ prt_newline(out); -+ -+ prt_printf(out, "Has data:"); -+ prt_tab(out); -+ if (data_have) -+ prt_bitflags(out, bch2_data_types, data_have); -+ else -+ prt_printf(out, "(none)"); -+ prt_newline(out); -+ -+ prt_printf(out, "Discard:"); -+ prt_tab(out); -+ prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); -+ prt_newline(out); -+ -+ prt_printf(out, "Freespace initialized:"); -+ prt_tab(out); -+ prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); -+ prt_newline(out); -+ -+ printbuf_indent_sub(out, 2); -+ } -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_members = { -+ .validate = bch2_sb_members_validate, -+ .to_text = bch2_sb_members_to_text, -+}; -+ -+/* BCH_SB_FIELD_crypt: */ -+ -+static int bch2_sb_crypt_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); -+ -+ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { -+ prt_printf(err, "wrong size (got %zu should be %zu)", -+ vstruct_bytes(&crypt->field), sizeof(*crypt)); -+ return -BCH_ERR_invalid_sb_crypt; -+ } -+ -+ if (BCH_CRYPT_KDF_TYPE(crypt)) { -+ prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); -+ return -BCH_ERR_invalid_sb_crypt; -+ } -+ -+ return 0; -+} -+ -+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); -+ -+ prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); -+ prt_newline(out); -+ prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); -+ prt_newline(out); -+ prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); -+ prt_newline(out); -+ prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); -+ prt_newline(out); -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { -+ .validate = bch2_sb_crypt_validate, -+ .to_text = bch2_sb_crypt_to_text, -+}; -+ -+/* 
BCH_SB_FIELD_clean: */ -+ -+int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) -+{ -+ struct jset_entry *entry; -+ int ret; -+ -+ for (entry = clean->start; -+ entry < (struct jset_entry *) vstruct_end(&clean->field); -+ entry = vstruct_next(entry)) { -+ ret = bch2_journal_entry_validate(c, NULL, entry, -+ le16_to_cpu(c->disk_sb.sb->version), -+ BCH_SB_BIG_ENDIAN(c->disk_sb.sb), -+ write); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ +/* Downgrade if superblock is at a higher version than currently supported: */ +void bch2_sb_maybe_downgrade(struct bch_fs *c) +{ @@ -84137,232 +86801,6 @@ index 000000000..d2d3eba4d + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); +} + -+int bch2_fs_mark_dirty(struct bch_fs *c) -+{ -+ int ret; -+ -+ /* -+ * Unconditionally write superblock, to verify it hasn't changed before -+ * we go rw: -+ */ -+ -+ mutex_lock(&c->sb_lock); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ -+ bch2_sb_maybe_downgrade(c); -+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); -+ -+ ret = bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -+{ -+ struct jset_entry *entry = *end; -+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); -+ -+ memset(entry, 0, u64s * sizeof(u64)); -+ /* -+ * The u64s field counts from the start of data, ignoring the shared -+ * fields. -+ */ -+ entry->u64s = cpu_to_le16(u64s - 1); -+ -+ *end = vstruct_next(*end); -+ return entry; -+} -+ -+void bch2_journal_super_entries_add_common(struct bch_fs *c, -+ struct jset_entry **end, -+ u64 journal_seq) -+{ -+ struct bch_dev *ca; -+ unsigned i, dev; -+ -+ percpu_down_read(&c->mark_lock); -+ -+ if (!journal_seq) { -+ for (i = 0; i < ARRAY_SIZE(c->usage); i++) -+ bch2_fs_usage_acc_to_base(c, i); -+ } else { -+ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); -+ } -+ -+ { -+ struct jset_entry_usage *u = -+ container_of(jset_entry_init(end, sizeof(*u)), -+ struct jset_entry_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = BCH_FS_USAGE_inodes; -+ u->v = cpu_to_le64(c->usage_base->nr_inodes); -+ } -+ -+ { -+ struct jset_entry_usage *u = -+ container_of(jset_entry_init(end, sizeof(*u)), -+ struct jset_entry_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = BCH_FS_USAGE_key_version; -+ u->v = cpu_to_le64(atomic64_read(&c->key_version)); -+ } -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) { -+ struct jset_entry_usage *u = -+ container_of(jset_entry_init(end, sizeof(*u)), -+ struct jset_entry_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = BCH_FS_USAGE_reserved; -+ u->entry.level = i; -+ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ struct jset_entry_data_usage *u = -+ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), -+ struct jset_entry_data_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_data_usage; -+ u->v = cpu_to_le64(c->usage_base->replicas[i]); -+ unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), -+ "embedded variable length struct"); -+ } -+ -+ for_each_member_device(ca, c, dev) { -+ unsigned b = sizeof(struct jset_entry_dev_usage) + -+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; -+ struct jset_entry_dev_usage *u = -+ container_of(jset_entry_init(end, b), 
-+ struct jset_entry_dev_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_dev_usage; -+ u->dev = cpu_to_le32(dev); -+ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); -+ -+ for (i = 0; i < BCH_DATA_NR; i++) { -+ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); -+ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); -+ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); -+ } -+ } -+ -+ percpu_up_read(&c->mark_lock); -+ -+ for (i = 0; i < 2; i++) { -+ struct jset_entry_clock *clock = -+ container_of(jset_entry_init(end, sizeof(*clock)), -+ struct jset_entry_clock, entry); -+ -+ clock->entry.type = BCH_JSET_ENTRY_clock; -+ clock->rw = i; -+ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); -+ } -+} -+ -+void bch2_fs_mark_clean(struct bch_fs *c) -+{ -+ struct bch_sb_field_clean *sb_clean; -+ struct jset_entry *entry; -+ unsigned u64s; -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ if (BCH_SB_CLEAN(c->disk_sb.sb)) -+ goto out; -+ -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); -+ -+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); -+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); -+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); -+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); -+ -+ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; -+ -+ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); -+ if (!sb_clean) { -+ bch_err(c, "error resizing superblock while setting filesystem clean"); -+ goto out; -+ } -+ -+ sb_clean->flags = 0; -+ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); -+ -+ /* Trying to catch outstanding bug: */ -+ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); -+ -+ entry = sb_clean->start; -+ bch2_journal_super_entries_add_common(c, &entry, 0); -+ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); -+ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); -+ -+ memset(entry, 0, -+ vstruct_end(&sb_clean->field) - (void *) entry); -+ -+ /* -+ * this should be in the write path, and we should be validating every -+ * superblock section: -+ */ -+ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); -+ if (ret) { -+ bch_err(c, "error writing marking filesystem clean: validate error"); -+ goto out; -+ } -+ -+ bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+} -+ -+static int bch2_sb_clean_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_clean *clean = field_to_type(f, clean); -+ -+ if (vstruct_bytes(&clean->field) < sizeof(*clean)) { -+ prt_printf(err, "wrong size (got %zu should be %zu)", -+ vstruct_bytes(&clean->field), sizeof(*clean)); -+ return -BCH_ERR_invalid_sb_clean; -+ } -+ -+ return 0; -+} -+ -+static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_clean *clean = field_to_type(f, clean); -+ struct jset_entry *entry; -+ -+ prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); -+ prt_newline(out); -+ prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); -+ prt_newline(out); -+ -+ for (entry = clean->start; -+ entry != vstruct_end(&clean->field); -+ entry = vstruct_next(entry)) { -+ if (entry->type == BCH_JSET_ENTRY_btree_keys && -+ !entry->u64s) -+ continue; -+ -+ bch2_journal_entry_to_text(out, NULL, entry); -+ prt_newline(out); -+ } -+} -+ -+static const struct 
bch_sb_field_ops bch_sb_field_ops_clean = { -+ .validate = bch2_sb_clean_validate, -+ .to_text = bch2_sb_clean_to_text, -+}; -+ +static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { +#define x(f, nr) \ + [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, @@ -84588,10 +87026,10 @@ index 000000000..d2d3eba4d +} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h new file mode 100644 -index 000000000..904adea6a +index 000000000..d51c0a195 --- /dev/null +++ b/fs/bcachefs/super-io.h -@@ -0,0 +1,142 @@ +@@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_IO_H +#define _BCACHEFS_SUPER_IO_H @@ -84652,6 +87090,7 @@ index 000000000..904adea6a +static inline __le64 bch2_sb_magic(struct bch_fs *c) +{ + __le64 ret; ++ + memcpy(&ret, &c->sb.uuid, sizeof(ret)); + return ret; +} @@ -84715,19 +87154,9 @@ index 000000000..904adea6a + }; +} + -+/* BCH_SB_FIELD_clean: */ -+ -+void bch2_journal_super_entries_add_common(struct bch_fs *, -+ struct jset_entry **, u64); -+ -+int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); -+ +void bch2_sb_maybe_downgrade(struct bch_fs *); +void bch2_sb_upgrade(struct bch_fs *, unsigned); + -+int bch2_fs_mark_dirty(struct bch_fs *); -+void bch2_fs_mark_clean(struct bch_fs *); -+ +void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); +void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); @@ -84736,10 +87165,10 @@ index 000000000..904adea6a +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 -index 000000000..eee56969c +index 000000000..604248659 --- /dev/null +++ b/fs/bcachefs/super.c -@@ -0,0 +1,2007 @@ +@@ -0,0 +1,2015 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and @@ -84755,6 +87184,7 @@ index 000000000..eee56969c +#include "bkey_sort.h" +#include "btree_cache.h" +#include "btree_gc.h" ++#include "btree_journal_iter.h" +#include "btree_key_cache.h" +#include "btree_update_interior.h" +#include "btree_io.h" @@ -84772,6 +87202,8 @@ index 000000000..eee56969c +#include "error.h" +#include "fs.h" +#include "fs-io.h" ++#include "fs-io-buffered.h" ++#include "fs-io-direct.h" +#include "fsck.h" +#include "inode.h" +#include "io.h" @@ -84786,6 +87218,8 @@ index 000000000..eee56969c +#include "rebalance.h" +#include "recovery.h" +#include "replicas.h" ++#include "sb-clean.h" ++#include "snapshot.h" +#include "subvolume.h" +#include "super.h" +#include "super-io.h" @@ -85211,6 +87645,8 @@ index 000000000..eee56969c + bch2_fs_counters_exit(c); + bch2_fs_snapshots_exit(c); + bch2_fs_quota_exit(c); ++ bch2_fs_fs_io_direct_exit(c); ++ bch2_fs_fs_io_buffered_exit(c); + bch2_fs_fsio_exit(c); + bch2_fs_ec_exit(c); + bch2_fs_encryption_exit(c); @@ -85310,19 +87746,14 @@ index 000000000..eee56969c + cancel_work_sync(&ca->io_error_work); + + cancel_work_sync(&c->read_only_work); -+ -+ for (i = 0; i < c->sb.nr_devices; i++) { -+ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); -+ -+ if (ca) -+ bch2_free_super(&ca->disk_sb); -+ } +} + +void bch2_fs_free(struct bch_fs *c) +{ + unsigned i; + ++ BUG_ON(!test_bit(BCH_FS_STOPPING, &c->flags)); ++ + mutex_lock(&bch_fs_list_lock); + list_del(&c->list); + mutex_unlock(&bch_fs_list_lock); @@ -85330,9 +87761,14 @@ index 000000000..eee56969c + closure_sync(&c->cl); + closure_debug_destroy(&c->cl); + -+ for (i = 0; i < c->sb.nr_devices; i++) -+ if (c->devs[i]) -+ 
bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); ++ for (i = 0; i < c->sb.nr_devices; i++) { ++ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); ++ ++ if (ca) { ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_free(ca); ++ } ++ } + + bch_verbose(c, "shutdown complete"); + @@ -85586,7 +88022,9 @@ index 000000000..eee56969c + bch2_fs_encryption_init(c) ?: + bch2_fs_compress_init(c) ?: + bch2_fs_ec_init(c) ?: -+ bch2_fs_fsio_init(c); ++ bch2_fs_fsio_init(c) ?: ++ bch2_fs_fs_io_buffered_init(c); ++ bch2_fs_fs_io_direct_init(c); + if (ret) + goto err; + @@ -85970,8 +88408,6 @@ index 000000000..eee56969c + + /* Commit: */ + ca->disk_sb = *sb; -+ if (sb->mode & FMODE_EXCL) -+ ca->disk_sb.bdev->bd_holder = ca; + memset(sb, 0, sizeof(*sb)); + + ca->dev = ca->disk_sb.bdev->bd_dev; @@ -86742,6 +89178,7 @@ index 000000000..eee56969c +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + ++__maybe_unused +static unsigned bch2_metadata_version = bcachefs_metadata_version_current; +module_param_named(version, bch2_metadata_version, uint, 0400); + @@ -86749,10 +89186,10 @@ index 000000000..eee56969c +module_init(bcachefs_init); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h new file mode 100644 -index 000000000..36bcb9ec2 +index 000000000..bf762df18 --- /dev/null +++ b/fs/bcachefs/super.h -@@ -0,0 +1,266 @@ +@@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_H +#define _BCACHEFS_SUPER_H @@ -86763,220 +89200,6 @@ index 000000000..36bcb9ec2 + +#include + -+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) -+{ -+ return div_u64(s, ca->mi.bucket_size); -+} -+ -+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) -+{ -+ return ((sector_t) b) * ca->mi.bucket_size; -+} -+ -+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -+{ -+ u32 remainder; -+ -+ div_u64_rem(s, ca->mi.bucket_size, &remainder); -+ return remainder; -+} -+ -+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, -+ u32 *offset) -+{ -+ return div_u64_rem(s, ca->mi.bucket_size, offset); -+} -+ -+static inline bool bch2_dev_is_online(struct bch_dev *ca) -+{ -+ return !percpu_ref_is_zero(&ca->io_ref); -+} -+ -+static inline bool bch2_dev_is_readable(struct bch_dev *ca) -+{ -+ return bch2_dev_is_online(ca) && -+ ca->mi.state != BCH_MEMBER_STATE_failed; -+} -+ -+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) -+{ -+ if (!percpu_ref_tryget(&ca->io_ref)) -+ return false; -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_rw || -+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) -+ return true; -+ -+ percpu_ref_put(&ca->io_ref); -+ return false; -+} -+ -+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) -+{ -+ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); -+} -+ -+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, -+ unsigned dev) -+{ -+ unsigned i; -+ -+ for (i = 0; i < devs.nr; i++) -+ if (devs.devs[i] == dev) -+ return true; -+ -+ return false; -+} -+ -+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, -+ unsigned dev) -+{ -+ unsigned i; -+ -+ for (i = 0; i < devs->nr; i++) -+ if (devs->devs[i] == dev) { -+ array_remove_item(devs->devs, devs->nr, i); -+ return; -+ } -+} -+ -+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, -+ unsigned dev) -+{ -+ if (!bch2_dev_list_has_dev(*devs, dev)) { -+ BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); -+ devs->devs[devs->nr++] = dev; -+ } -+} -+ -+static inline 
struct bch_devs_list bch2_dev_list_single(unsigned dev) -+{ -+ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; -+} -+ -+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, -+ const struct bch_devs_mask *mask) -+{ -+ struct bch_dev *ca = NULL; -+ -+ while ((*iter = mask -+ ? find_next_bit(mask->d, c->sb.nr_devices, *iter) -+ : *iter) < c->sb.nr_devices && -+ !(ca = rcu_dereference_check(c->devs[*iter], -+ lockdep_is_held(&c->state_lock)))) -+ (*iter)++; -+ -+ return ca; -+} -+ -+#define for_each_member_device_rcu(ca, c, iter, mask) \ -+ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) -+ -+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) -+{ -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ if ((ca = __bch2_next_dev(c, iter, NULL))) -+ percpu_ref_get(&ca->ref); -+ rcu_read_unlock(); -+ -+ return ca; -+} -+ -+/* -+ * If you break early, you must drop your ref on the current device -+ */ -+#define for_each_member_device(ca, c, iter) \ -+ for ((iter) = 0; \ -+ (ca = bch2_get_next_dev(c, &(iter))); \ -+ percpu_ref_put(&ca->ref), (iter)++) -+ -+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, -+ unsigned *iter, -+ int state_mask) -+{ -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ while ((ca = __bch2_next_dev(c, iter, NULL)) && -+ (!((1 << ca->mi.state) & state_mask) || -+ !percpu_ref_tryget(&ca->io_ref))) -+ (*iter)++; -+ rcu_read_unlock(); -+ -+ return ca; -+} -+ -+#define __for_each_online_member(ca, c, iter, state_mask) \ -+ for ((iter) = 0; \ -+ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ -+ percpu_ref_put(&ca->io_ref), (iter)++) -+ -+#define for_each_online_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, ~0) -+ -+#define for_each_rw_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) -+ -+#define for_each_readable_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, \ -+ (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) -+ -+/* -+ * If a key exists that references a device, the device won't be going away and -+ * we can omit rcu_read_lock(): -+ */ -+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) -+{ -+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); -+ -+ return rcu_dereference_check(c->devs[idx], 1); -+} -+ -+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) -+{ -+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); -+ -+ return rcu_dereference_protected(c->devs[idx], -+ lockdep_is_held(&c->sb_lock) || -+ lockdep_is_held(&c->state_lock)); -+} -+ -+/* XXX kill, move to struct bch_fs */ -+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -+{ -+ struct bch_devs_mask devs; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ memset(&devs, 0, sizeof(devs)); -+ for_each_online_member(ca, c, i) -+ __set_bit(ca->dev_idx, devs.d); -+ return devs; -+} -+ -+static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) -+{ -+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; -+ u64 b_offset = bucket_to_sector(ca, b); -+ u64 b_end = bucket_to_sector(ca, b + 1); -+ unsigned i; -+ -+ if (!b) -+ return true; -+ -+ for (i = 0; i < layout->nr_superblocks; i++) { -+ u64 offset = le64_to_cpu(layout->sb_offset[i]); -+ u64 end = offset + (1 << layout->sb_max_size_bits); -+ -+ if (!(offset >= b_end || end <= b_offset)) -+ return true; -+ } -+ -+ return false; -+} -+ +struct bch_fs *bch2_dev_to_fs(dev_t); +struct bch_fs 
*bch2_uuid_to_fs(__uuid_t); + @@ -87021,10 +89244,10 @@ index 000000000..36bcb9ec2 +#endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h new file mode 100644 -index 000000000..89419fc79 +index 000000000..08faeedba --- /dev/null +++ b/fs/bcachefs/super_types.h -@@ -0,0 +1,51 @@ +@@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_TYPES_H +#define _BCACHEFS_SUPER_TYPES_H @@ -87033,6 +89256,7 @@ index 000000000..89419fc79 + struct bch_sb *sb; + struct block_device *bdev; + struct bio *bio; ++ void *holder; + size_t buffer_size; + fmode_t mode; + unsigned have_layout:1; @@ -87078,10 +89302,10 @@ index 000000000..89419fc79 +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 -index 000000000..740305e67 +index 000000000..941f4bcb9 --- /dev/null +++ b/fs/bcachefs/sysfs.c -@@ -0,0 +1,1064 @@ +@@ -0,0 +1,1059 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache sysfs interfaces @@ -87332,7 +89556,6 @@ index 000000000..740305e67 +read_attribute(io_timers_read); +read_attribute(io_timers_write); + -+read_attribute(data_jobs); +read_attribute(moving_ctxts); + +#ifdef CONFIG_BCACHEFS_TESTS @@ -87542,9 +89765,6 @@ index 000000000..740305e67 + if (attr == &sysfs_io_timers_write) + bch2_io_timers_to_text(out, &c->io_clock[WRITE]); + -+ if (attr == &sysfs_data_jobs) -+ bch2_data_jobs_to_text(out, c); -+ + if (attr == &sysfs_moving_ctxts) + bch2_fs_moving_ctxts_to_text(out, c); + @@ -87765,7 +89985,6 @@ index 000000000..740305e67 + &sysfs_rebalance_work, + sysfs_pd_controller_files(rebalance), + -+ &sysfs_data_jobs, + &sysfs_moving_ctxts, + + &sysfs_internal_uuid, @@ -88202,17 +90421,17 @@ index 000000000..222cd5062 +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c new file mode 100644 -index 000000000..cef23d2cc +index 000000000..72389c737 --- /dev/null +++ b/fs/bcachefs/tests.c -@@ -0,0 +1,939 @@ +@@ -0,0 +1,970 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_TESTS + +#include "bcachefs.h" +#include "btree_update.h" +#include "journal_reclaim.h" -+#include "subvolume.h" ++#include "snapshot.h" +#include "tests.h" + +#include "linux/kthread.h" @@ -88711,6 +90930,36 @@ index 000000000..cef23d2cc + __test_extent_overwrite(c, 32, 64, 32, 128); +} + ++static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid) ++{ ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k_i.k.p.inode = inum; ++ k.k_i.k.p.offset = start + len; ++ k.k_i.k.p.snapshot = snapid; ++ k.k_i.k.size = len; ++ ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_btree_insert_nonextent(&trans, BTREE_ID_extents, &k.k_i, ++ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); ++ if (ret) ++ bch_err_fn(c, ret); ++ return ret; ++} ++ ++static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) ++{ ++ return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */ ++ insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?: ++ insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?: ++ insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */ ++ insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?: ++ insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?: ++ insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX); ++} ++ +/* snapshot unit tests */ + +/* Test skipping over keys in unrelated snapshots: */ @@ -89109,6 +91358,7 
@@ index 000000000..cef23d2cc + perf_test(test_extent_overwrite_back); + perf_test(test_extent_overwrite_middle); + perf_test(test_extent_overwrite_all); ++ perf_test(test_extent_create_overlapping); + + perf_test(test_snapshots); + @@ -89168,7 +91418,7 @@ index 000000000..c73b18aea +#endif /* _BCACHEFS_TEST_H */ diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c new file mode 100644 -index 000000000..d294b3d71 +index 000000000..33efa6005 --- /dev/null +++ b/fs/bcachefs/trace.c @@ -0,0 +1,16 @@ @@ -89182,18 +91432,18 @@ index 000000000..d294b3d71 +#include "btree_update_interior.h" +#include "keylist.h" +#include "opts.h" ++#include "six.h" + +#include -+#include + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h new file mode 100644 -index 000000000..a743ab477 +index 000000000..97fe77423 --- /dev/null +++ b/fs/bcachefs/trace.h -@@ -0,0 +1,1247 @@ +@@ -0,0 +1,1265 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcachefs @@ -89599,29 +91849,43 @@ index 000000000..a743ab477 + __field(u8, level ) + TRACE_BPOS_entries(pos) + __array(char, node, 24 ) ++ __field(u8, self_read_count ) ++ __field(u8, self_intent_count) ++ __field(u8, read_count ) ++ __field(u8, intent_count ) + __field(u32, iter_lock_seq ) + __field(u32, node_lock_seq ) + ), + + TP_fast_assign( + struct btree *b = btree_path_node(path, level); ++ struct six_lock_count c; + + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->level = path->level; + TRACE_BPOS_assign(pos, path->pos); -+ if (IS_ERR(b)) ++ ++ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), ++ __entry->self_read_count = c.n[SIX_LOCK_read]; ++ __entry->self_intent_count = c.n[SIX_LOCK_intent]; ++ ++ if (IS_ERR(b)) { + strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); -+ else ++ } else { ++ c = six_lock_counts(&path->l[level].b->c.lock); ++ __entry->read_count = c.n[SIX_LOCK_read]; ++ __entry->intent_count = c.n[SIX_LOCK_intent]; + scnprintf(__entry->node, sizeof(__entry->node), "%px", b); ++ } + __entry->iter_lock_seq = path->l[level].lock_seq; + __entry->node_lock_seq = is_btree_node(path, level) + ? six_lock_seq(&path->l[level].b->c.lock) + : 0; + ), + -+ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u", ++ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_ids[__entry->btree_id], @@ -89630,6 +91894,10 @@ index 000000000..a743ab477 + __entry->pos_snapshot, + __entry->level, + __entry->node, ++ __entry->self_read_count, ++ __entry->self_intent_count, ++ __entry->read_count, ++ __entry->intent_count, + __entry->iter_lock_seq, + __entry->node_lock_seq) +); @@ -89671,7 +91939,7 @@ index 000000000..a743ab477 + __entry->self_intent_count = c.n[SIX_LOCK_intent]; + c = six_lock_counts(&path->l[level].b->c.lock); + __entry->read_count = c.n[SIX_LOCK_read]; -+ __entry->intent_count = c.n[SIX_LOCK_read]; ++ __entry->intent_count = c.n[SIX_LOCK_intent]; + __entry->iter_lock_seq = path->l[level].lock_seq; + __entry->node_lock_seq = is_btree_node(path, level) + ? 
six_lock_seq(&path->l[level].b->c.lock) @@ -90522,10 +92790,10 @@ index 000000000..905801772 +#endif /* _BCACHEFS_TWO_STATE_LOCK_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 -index 000000000..ae4f6de3c +index 000000000..636f1fa42 --- /dev/null +++ b/fs/bcachefs/util.c -@@ -0,0 +1,1137 @@ +@@ -0,0 +1,1144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * random utiility code, for bcache but in theory not specific to bcache @@ -90744,6 +93012,7 @@ index 000000000..ae4f6de3c + + while ((p = strsep(&s, ","))) { + int flag = match_string(list, -1, p); ++ + if (flag < 0) { + ret = -1; + break; @@ -90796,6 +93065,7 @@ index 000000000..ae4f6de3c + +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) +{ ++#ifdef CONFIG_STACKTRACE + unsigned nr_entries = 0; + int ret = 0; + @@ -90816,6 +93086,9 @@ index 000000000..ae4f6de3c + up_read(&task->signal->exec_update_lock); + + return ret; ++#else ++ return 0; ++#endif +} + +void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) @@ -91284,10 +93557,10 @@ index 000000000..ae4f6de3c + } +} + -+int bch2_bio_alloc_pages_noprof(struct bio *bio, size_t size, gfp_t gfp_mask) ++int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) +{ + while (size) { -+ struct page *page = alloc_pages_noprof(gfp_mask, 0); ++ struct page *page = alloc_pages(gfp_mask, 0); + unsigned len = min_t(size_t, PAGE_SIZE, size); + + if (!page) @@ -91325,9 +93598,10 @@ index 000000000..ae4f6de3c + struct bvec_iter iter; + + __bio_for_each_segment(bv, dst, iter, dst_iter) { -+ void *dstp = kmap_atomic(bv.bv_page); ++ void *dstp = kmap_local_page(bv.bv_page); ++ + memcpy(dstp + bv.bv_offset, src, bv.bv_len); -+ kunmap_atomic(dstp); ++ kunmap_local(dstp); + + src += bv.bv_len; + } @@ -91339,9 +93613,10 @@ index 000000000..ae4f6de3c + struct bvec_iter iter; + + __bio_for_each_segment(bv, src, iter, src_iter) { -+ void *srcp = kmap_atomic(bv.bv_page); ++ void *srcp = kmap_local_page(bv.bv_page); ++ + memcpy(dst, srcp + bv.bv_offset, bv.bv_len); -+ kunmap_atomic(srcp); ++ kunmap_local(srcp); + + dst += bv.bv_len; + } @@ -91665,10 +93940,10 @@ index 000000000..ae4f6de3c +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 -index 000000000..5fa29dab3 +index 000000000..19cc6bfe9 --- /dev/null +++ b/fs/bcachefs/util.h -@@ -0,0 +1,846 @@ +@@ -0,0 +1,851 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_UTIL_H +#define _BCACHEFS_UTIL_H @@ -91731,13 +94006,12 @@ index 000000000..5fa29dab3 + free_pages((unsigned long) p, get_order(size)); +} + -+static inline void *vpmalloc_noprof(size_t size, gfp_t gfp_mask) ++static inline void *vpmalloc(size_t size, gfp_t gfp_mask) +{ -+ return (void *) get_free_pages_noprof(gfp_mask|__GFP_NOWARN, -+ get_order(size)) ?: -+ __vmalloc_noprof(size, gfp_mask); ++ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, ++ get_order(size)) ?: ++ __vmalloc(size, gfp_mask); +} -+#define vpmalloc(_size, _gfp) alloc_hooks(vpmalloc_noprof(_size, _gfp)) + +static inline void kvpfree(void *p, size_t size) +{ @@ -91747,13 +94021,12 @@ index 000000000..5fa29dab3 + vpfree(p, size); +} + -+static inline void *kvpmalloc_noprof(size_t size, gfp_t gfp_mask) ++static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) +{ + return size < PAGE_SIZE -+ ? kmalloc_noprof(size, gfp_mask) -+ : vpmalloc_noprof(size, gfp_mask); ++ ? 
kmalloc(size, gfp_mask) ++ : vpmalloc(size, gfp_mask); +} -+#define kvpmalloc(_size, _gfp) alloc_hooks(kvpmalloc_noprof(_size, _gfp)) + +int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); + @@ -92139,8 +94412,10 @@ index 000000000..5fa29dab3 + s64 last_change; + s64 last_target; + -+ /* If true, the rate will not increase if bch2_ratelimit_delay() -+ * is not being called often enough. */ ++ /* ++ * If true, the rate will not increase if bch2_ratelimit_delay() ++ * is not being called often enough. ++ */ + bool backpressure; +}; + @@ -92203,9 +94478,7 @@ index 000000000..5fa29dab3 +} + +void bch2_bio_map(struct bio *bio, void *base, size_t); -+int bch2_bio_alloc_pages_noprof(struct bio *, size_t, gfp_t); -+#define bch2_bio_alloc_pages(_bio, _size, _gfp) \ -+ alloc_hooks(bch2_bio_alloc_pages_noprof(_bio, _size, _gfp)) ++int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); + +static inline sector_t bdev_sectors(struct block_device *bdev) +{ @@ -92278,6 +94551,7 @@ index 000000000..5fa29dab3 +{ +#ifdef CONFIG_X86_64 + long d0, d1, d2; ++ + asm volatile("rep ; movsq" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (u64s), "1" (dst), "2" (src) @@ -92354,6 +94628,7 @@ index 000000000..5fa29dab3 + +#ifdef CONFIG_X86_64 + long d0, d1, d2; ++ + asm volatile("std ;\n" + "rep ; movsq\n" + "cld ;\n" @@ -92512,15 +94787,20 @@ index 000000000..5fa29dab3 + return cmp_int(l, r); +} + ++static inline int cmp_le32(__le32 l, __le32 r) ++{ ++ return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); ++} ++ +#include + +#endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c new file mode 100644 -index 000000000..ef030fc02 +index 000000000..2a2ab86ed --- /dev/null +++ b/fs/bcachefs/varint.c -@@ -0,0 +1,122 @@ +@@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include @@ -92582,6 +94862,7 @@ index 000000000..ef030fc02 + + if (likely(bytes < 9)) { + __le64 v_le = 0; ++ + memcpy(&v_le, in, bytes); + v = le64_to_cpu(v_le); + v >>= bytes; @@ -92731,10 +95012,10 @@ index 000000000..53a694d71 +#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c new file mode 100644 -index 000000000..70f78006d +index 000000000..6f6b3caf0 --- /dev/null +++ b/fs/bcachefs/xattr.c -@@ -0,0 +1,648 @@ +@@ -0,0 +1,649 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -93231,7 +95512,8 @@ index 000000000..70f78006d + bool defined; +}; + -+static int inode_opt_set_fn(struct bch_inode_info *inode, ++static int inode_opt_set_fn(struct btree_trans *trans, ++ struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ @@ -93473,10 +95755,10 @@ index 52e6d5fda..dbdafa261 100644 } EXPORT_SYMBOL(d_tmpfile); diff --git a/fs/inode.c b/fs/inode.c -index b9d498032..6bb7646cb 100644 +index 67611a360..968931eb4 100644 --- a/fs/inode.c +++ b/fs/inode.c -@@ -57,8 +57,23 @@ +@@ -56,8 +56,23 @@ static unsigned int i_hash_mask __read_mostly; static unsigned int i_hash_shift __read_mostly; @@ -93502,7 +95784,7 @@ index b9d498032..6bb7646cb 100644 /* * Empty aops. 
Can be used for the cases where the user does not -@@ -417,7 +432,7 @@ EXPORT_SYMBOL(address_space_init_once); +@@ -416,7 +431,7 @@ EXPORT_SYMBOL(address_space_init_once); void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); @@ -93511,7 +95793,7 @@ index b9d498032..6bb7646cb 100644 INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); -@@ -506,14 +521,15 @@ static inline void inode_sb_list_del(struct inode *inode) +@@ -505,14 +520,15 @@ static inline void inode_sb_list_del(struct inode *inode) } } @@ -93534,7 +95816,7 @@ index b9d498032..6bb7646cb 100644 } /** -@@ -526,13 +542,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) +@@ -525,13 +541,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { @@ -93552,7 +95834,7 @@ index b9d498032..6bb7646cb 100644 } EXPORT_SYMBOL(__insert_inode_hash); -@@ -544,11 +560,44 @@ EXPORT_SYMBOL(__insert_inode_hash); +@@ -543,11 +559,44 @@ EXPORT_SYMBOL(__insert_inode_hash); */ void __remove_inode_hash(struct inode *inode) { @@ -93602,7 +95884,7 @@ index b9d498032..6bb7646cb 100644 } EXPORT_SYMBOL(__remove_inode_hash); -@@ -897,26 +946,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) +@@ -896,26 +945,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } @@ -93635,7 +95917,7 @@ index b9d498032..6bb7646cb 100644 goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { -@@ -935,19 +986,20 @@ static struct inode *find_inode(struct super_block *sb, +@@ -934,19 +985,20 @@ static struct inode *find_inode(struct super_block *sb, * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, @@ -93659,7 +95941,7 @@ index b9d498032..6bb7646cb 100644 goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { -@@ -1197,25 +1249,25 @@ EXPORT_SYMBOL(unlock_two_nondirectories); +@@ -1196,25 +1248,25 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). 
* @@ -93691,7 +95973,7 @@ index b9d498032..6bb7646cb 100644 if (IS_ERR(old)) return NULL; wait_on_inode(old); -@@ -1237,7 +1289,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, +@@ -1236,7 +1288,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; @@ -93700,7 +95982,7 @@ index b9d498032..6bb7646cb 100644 spin_unlock(&inode->i_lock); /* -@@ -1247,7 +1299,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, +@@ -1246,7 +1298,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); unlock: @@ -93709,7 +95991,7 @@ index b9d498032..6bb7646cb 100644 return inode; } -@@ -1308,12 +1360,12 @@ EXPORT_SYMBOL(iget5_locked); +@@ -1307,12 +1359,12 @@ EXPORT_SYMBOL(iget5_locked); */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { @@ -93726,7 +96008,7 @@ index b9d498032..6bb7646cb 100644 if (inode) { if (IS_ERR(inode)) return NULL; -@@ -1329,17 +1381,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) +@@ -1328,17 +1380,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) if (inode) { struct inode *old; @@ -93748,7 +96030,7 @@ index b9d498032..6bb7646cb 100644 /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents -@@ -1352,7 +1404,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) +@@ -1351,7 +1403,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) * us. Use the old inode instead of the one we just * allocated. */ @@ -93757,7 +96039,7 @@ index b9d498032..6bb7646cb 100644 destroy_inode(inode); if (IS_ERR(old)) return NULL; -@@ -1376,10 +1428,11 @@ EXPORT_SYMBOL(iget_locked); +@@ -1375,10 +1427,11 @@ EXPORT_SYMBOL(iget_locked); */ static int test_inode_iunique(struct super_block *sb, unsigned long ino) { @@ -93771,7 +96053,7 @@ index b9d498032..6bb7646cb 100644 if (inode->i_ino == ino && inode->i_sb == sb) return 0; } -@@ -1463,12 +1516,12 @@ EXPORT_SYMBOL(igrab); +@@ -1462,12 +1515,12 @@ EXPORT_SYMBOL(igrab); struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { @@ -93788,7 +96070,7 @@ index b9d498032..6bb7646cb 100644 return IS_ERR(inode) ? 
NULL : inode; } -@@ -1518,12 +1571,12 @@ EXPORT_SYMBOL(ilookup5); +@@ -1517,12 +1570,12 @@ EXPORT_SYMBOL(ilookup5); */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { @@ -93805,7 +96087,7 @@ index b9d498032..6bb7646cb 100644 if (inode) { if (IS_ERR(inode)) -@@ -1567,12 +1620,13 @@ struct inode *find_inode_nowait(struct super_block *sb, +@@ -1566,12 +1619,13 @@ struct inode *find_inode_nowait(struct super_block *sb, void *), void *data) { @@ -93822,7 +96104,7 @@ index b9d498032..6bb7646cb 100644 if (inode->i_sb != sb) continue; mval = match(inode, hashval, data); -@@ -1583,7 +1637,7 @@ struct inode *find_inode_nowait(struct super_block *sb, +@@ -1582,7 +1636,7 @@ struct inode *find_inode_nowait(struct super_block *sb, goto out; } out: @@ -93831,7 +96113,7 @@ index b9d498032..6bb7646cb 100644 return ret_inode; } EXPORT_SYMBOL(find_inode_nowait); -@@ -1612,13 +1666,14 @@ EXPORT_SYMBOL(find_inode_nowait); +@@ -1611,13 +1665,14 @@ EXPORT_SYMBOL(find_inode_nowait); struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { @@ -93848,7 +96130,7 @@ index b9d498032..6bb7646cb 100644 if (inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) -@@ -1650,13 +1705,14 @@ EXPORT_SYMBOL(find_inode_rcu); +@@ -1649,13 +1704,14 @@ EXPORT_SYMBOL(find_inode_rcu); struct inode *find_inode_by_ino_rcu(struct super_block *sb, unsigned long ino) { @@ -93865,7 +96147,7 @@ index b9d498032..6bb7646cb 100644 if (inode->i_ino == ino && inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) -@@ -1670,39 +1726,42 @@ int insert_inode_locked(struct inode *inode) +@@ -1669,39 +1725,42 @@ int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; @@ -93921,7 +96203,7 @@ index b9d498032..6bb7646cb 100644 wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); -@@ -2227,17 +2286,18 @@ EXPORT_SYMBOL(inode_needs_sync); +@@ -2226,17 +2285,18 @@ EXPORT_SYMBOL(inode_needs_sync); * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. 
*/ @@ -93943,7 +96225,7 @@ index b9d498032..6bb7646cb 100644 } static __initdata unsigned long ihash_entries; -@@ -2263,7 +2323,7 @@ void __init inode_init_early(void) +@@ -2262,7 +2322,7 @@ void __init inode_init_early(void) inode_hashtable = alloc_large_system_hash("Inode-cache", @@ -93952,7 +96234,7 @@ index b9d498032..6bb7646cb 100644 ihash_entries, 14, HASH_EARLY | HASH_ZERO, -@@ -2289,7 +2349,7 @@ void __init inode_init(void) +@@ -2288,7 +2348,7 @@ void __init inode_init(void) inode_hashtable = alloc_large_system_hash("Inode-cache", @@ -93962,7 +96244,7 @@ index b9d498032..6bb7646cb 100644 14, HASH_ZERO, diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c -index 063133ec7..13c40c09d 100644 +index aa8967cca..72d32603f 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -292,8 +292,12 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, @@ -94015,7 +96297,7 @@ index 063133ec7..13c40c09d 100644 bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); - bio_add_folio(&bio, folio, plen, poff); + bio_add_folio_nofail(&bio, folio, plen, poff); - return submit_bio_wait(&bio); + + if (iomap->flags & IOMAP_F_NOSUBMIT) @@ -94026,7 +96308,7 @@ index 063133ec7..13c40c09d 100644 } static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, -@@ -1486,7 +1503,10 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, +@@ -1489,7 +1506,10 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, return error; } @@ -94038,7 +96320,7 @@ index 063133ec7..13c40c09d 100644 return 0; } -@@ -1524,8 +1544,9 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, +@@ -1527,8 +1547,9 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, * traversal in iomap_finish_ioend(). */ static struct bio * @@ -94049,7 +96331,7 @@ index 063133ec7..13c40c09d 100644 struct bio *new; new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); -@@ -1534,7 +1555,11 @@ iomap_chain_bio(struct bio *prev) +@@ -1537,7 +1558,11 @@ iomap_chain_bio(struct bio *prev) bio_chain(prev, new); bio_get(prev); /* for iomap_finish_ioend */ @@ -94062,80 +96344,15 @@ index 063133ec7..13c40c09d 100644 return new; } -@@ -1581,7 +1606,7 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, +@@ -1584,7 +1609,7 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, } if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { - wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); + wpc->ioend->io_bio = iomap_chain_bio(wpc); - bio_add_folio(wpc->ioend->io_bio, folio, len, poff); + bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); } -diff --git a/fs/super.c b/fs/super.c -index 04bc62ab7..a2decce02 100644 ---- a/fs/super.c -+++ b/fs/super.c -@@ -791,14 +791,7 @@ void iterate_supers_type(struct file_system_type *type, - - EXPORT_SYMBOL(iterate_supers_type); - --/** -- * get_super - get the superblock of a device -- * @bdev: device to get the superblock for -- * -- * Scans the superblock list and finds the superblock of the file system -- * mounted on the device given. %NULL is returned if no match is found. 
-- */ --struct super_block *get_super(struct block_device *bdev) -+static struct super_block *__get_super(struct block_device *bdev, bool try) - { - struct super_block *sb; - -@@ -813,7 +806,12 @@ struct super_block *get_super(struct block_device *bdev) - if (sb->s_bdev == bdev) { - sb->s_count++; - spin_unlock(&sb_lock); -- down_read(&sb->s_umount); -+ -+ if (!try) -+ down_read(&sb->s_umount); -+ else if (!down_read_trylock(&sb->s_umount)) -+ return NULL; -+ - /* still alive? */ - if (sb->s_root && (sb->s_flags & SB_BORN)) - return sb; -@@ -828,6 +826,30 @@ struct super_block *get_super(struct block_device *bdev) - return NULL; - } - -+/** -+ * get_super - get the superblock of a device -+ * @bdev: device to get the superblock for -+ * -+ * Scans the superblock list and finds the superblock of the file system -+ * mounted on the device given. %NULL is returned if no match is found. -+ */ -+struct super_block *get_super(struct block_device *bdev) -+{ -+ return __get_super(bdev, false); -+} -+ -+/** -+ * try_get_super - get the superblock of a device, using trylock on sb->s_umount -+ * @bdev: device to get the superblock for -+ * -+ * Scans the superblock list and finds the superblock of the file system -+ * mounted on the device given. %NULL is returned if no match is found. -+ */ -+struct super_block *try_get_super(struct block_device *bdev) -+{ -+ return __get_super(bdev, true); -+} -+ - /** - * get_active_super - get an active reference to the superblock of a device - * @bdev: device to get the superblock for diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 18c8f168b..f0003446f 100644 --- a/fs/xfs/xfs_iomap.c @@ -94151,7 +96368,7 @@ index 18c8f168b..f0003446f 100644 return xfs_alert_fsblock_zero(ip, imap); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h -index 6c09f8953..2733c5484 100644 +index e2866e7fa..29ecb5643 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -284,6 +284,7 @@ typedef struct xfs_mount { @@ -94171,7 +96388,7 @@ index 6c09f8953..2733c5484 100644 __XFS_HAS_FEAT(noalign, NOALIGN) __XFS_HAS_FEAT(allocsize, ALLOCSIZE) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c -index 4120bd1cb..83a0a043b 100644 +index 818510243..b6cdce43c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -121,7 +121,7 @@ enum { @@ -94191,7 +96408,7 @@ index 4120bd1cb..83a0a043b 100644 {} }; -@@ -1376,6 +1377,9 @@ xfs_fs_parse_param( +@@ -1396,6 +1397,9 @@ xfs_fs_parse_param( xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); parsing_mp->m_features |= XFS_FEAT_NOATTR2; return 0; @@ -94201,219 +96418,11 @@ index 4120bd1cb..83a0a043b 100644 default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; -diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h -new file mode 100644 -index 000000000..16fbf74ed ---- /dev/null -+++ b/include/asm-generic/codetag.lds.h -@@ -0,0 +1,15 @@ -+/* SPDX-License-Identifier: GPL-2.0-only */ -+#ifndef __ASM_GENERIC_CODETAG_LDS_H -+#define __ASM_GENERIC_CODETAG_LDS_H -+ -+#define SECTION_WITH_BOUNDARIES(_name) \ -+ . 
= ALIGN(8); \ -+ __start_##_name = .; \ -+ KEEP(*(_name)) \ -+ __stop_##_name = .; -+ -+#define CODETAG_SECTIONS() \ -+ SECTION_WITH_BOUNDARIES(alloc_tags) \ -+ SECTION_WITH_BOUNDARIES(dynamic_fault_tags) -+ -+#endif /* __ASM_GENERIC_CODETAG_LDS_H */ -diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h -index da9e5629e..47dd57ca7 100644 ---- a/include/asm-generic/vmlinux.lds.h -+++ b/include/asm-generic/vmlinux.lds.h -@@ -50,6 +50,8 @@ - * [__nosave_begin, __nosave_end] for the nosave data - */ - -+#include -+ - #ifndef LOAD_OFFSET - #define LOAD_OFFSET 0 - #endif -@@ -374,6 +376,7 @@ - . = ALIGN(8); \ - BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes) \ - BOUNDED_SECTION_BY(__dyndbg, ___dyndbg) \ -+ CODETAG_SECTIONS() \ - LIKELY_PROFILE() \ - BRANCH_PROFILE() \ - TRACE_PRINTKS() \ -diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h -new file mode 100644 -index 000000000..6c1b7e1dc ---- /dev/null -+++ b/include/linux/alloc_tag.h -@@ -0,0 +1,160 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * allocation tagging -+ */ -+#ifndef _LINUX_ALLOC_TAG_H -+#define _LINUX_ALLOC_TAG_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * An instance of this structure is created in a special ELF section at every -+ * allocation callsite. At runtime, the special section is treated as -+ * an array of these. Embedded codetag utilizes codetag framework. -+ */ -+struct alloc_tag { -+ struct codetag ct; -+ u64 __percpu *bytes_allocated; -+} __aligned(8); -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ -+void alloc_tags_show_mem_report(struct seq_buf *s); -+ -+static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) -+{ -+ return container_of(ct, struct alloc_tag, ct); -+} -+ -+#define DEFINE_ALLOC_TAG(_alloc_tag, _old) \ -+ static struct alloc_tag _alloc_tag __used __aligned(8) \ -+ __section("alloc_tags") = { .ct = CODE_TAG_INIT }; \ -+ struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag) -+ -+DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, -+ mem_alloc_profiling_key); -+ -+static inline bool mem_alloc_profiling_enabled(void) -+{ -+ return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, -+ &mem_alloc_profiling_key); -+} -+ -+static inline u64 alloc_tag_read(struct alloc_tag *tag) -+{ -+ u64 v = 0; -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ v += *per_cpu_ptr(tag->bytes_allocated, cpu); -+ -+ return v; -+} -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+ -+#define CODETAG_EMPTY (void *)1 -+ -+static inline bool is_codetag_empty(union codetag_ref *ref) -+{ -+ return ref->ct == CODETAG_EMPTY; -+} -+ -+static inline void set_codetag_empty(union codetag_ref *ref) -+{ -+ if (ref) -+ ref->ct = CODETAG_EMPTY; -+} -+ -+#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -+ -+static inline bool is_codetag_empty(union codetag_ref *ref) { return false; } -+static inline void set_codetag_empty(union codetag_ref *ref) {} -+ -+#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -+ -+static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes) -+{ -+ struct alloc_tag *tag; -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+ WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); -+#endif -+ if (!ref || !ref->ct) -+ return; -+ -+ if (is_codetag_empty(ref)) { -+ ref->ct = NULL; -+ return; -+ } -+ -+ tag = ct_to_alloc_tag(ref->ct); -+ -+ this_cpu_add(*tag->bytes_allocated, -bytes); -+ ref->ct = NULL; -+} -+ -+static inline void alloc_tag_sub(union codetag_ref *ref, 
size_t bytes) -+{ -+ __alloc_tag_sub(ref, bytes); -+} -+ -+static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) -+{ -+ __alloc_tag_sub(ref, bytes); -+} -+ -+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) -+{ -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+ WARN_ONCE(ref && ref->ct, -+ "alloc_tag was not cleared (got tag for %s:%u)\n",\ -+ ref->ct->filename, ref->ct->lineno); -+ -+ WARN_ONCE(!tag, "current->alloc_tag not set"); -+#endif -+ if (!ref || !tag) -+ return; -+ -+ ref->ct = &tag->ct; -+ this_cpu_add(*tag->bytes_allocated, bytes); -+} -+ -+#else -+ -+#define DEFINE_ALLOC_TAG(_alloc_tag, _old) -+static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} -+static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {} -+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, -+ size_t bytes) {} -+static inline void set_codetag_empty(union codetag_ref *ref) {} -+ -+#endif -+ -+typedef struct mempool_s mempool_t; -+ -+#define res_type_to_err(_res) _Generic((_res), \ -+ struct folio *: NULL, \ -+ struct page *: NULL, \ -+ mempool_t *: NULL, \ -+ void *: NULL, \ -+ unsigned long: 0, \ -+ int: -ENOMEM) -+ -+#define alloc_hooks(_do_alloc) \ -+({ \ -+ typeof(_do_alloc) _res; \ -+ DEFINE_ALLOC_TAG(_alloc_tag, _old); \ -+ \ -+ _res = !memory_fault() ? _do_alloc : res_type_to_err(_res); \ -+ alloc_tag_restore(&_alloc_tag, _old); \ -+ _res; \ -+}) -+ -+#endif /* _LINUX_ALLOC_TAG_H */ diff --git a/include/linux/bio.h b/include/linux/bio.h -index b3e7529ff..f2620f8d1 100644 +index 11984ed29..debbd8fcb 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h -@@ -484,7 +484,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, +@@ -488,7 +488,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); @@ -94428,10 +96437,10 @@ index b3e7529ff..f2620f8d1 100644 static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 67e942d77..10d30c0bc 100644 +index 87d94be78..61ffaaba4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h -@@ -855,6 +855,7 @@ extern const char *blk_op_str(enum req_op op); +@@ -846,6 +846,7 @@ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); @@ -94567,122 +96576,6 @@ index c88cdc4ae..722a586bb 100644 +} while (0) + #endif /* _LINUX_CLOSURE_H */ -diff --git a/include/linux/codetag.h b/include/linux/codetag.h -new file mode 100644 -index 000000000..87207f199 ---- /dev/null -+++ b/include/linux/codetag.h -@@ -0,0 +1,110 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * code tagging framework -+ */ -+#ifndef _LINUX_CODETAG_H -+#define _LINUX_CODETAG_H -+ -+#include -+ -+struct codetag_iterator; -+struct codetag_type; -+struct seq_buf; -+struct module; -+ -+/* -+ * An instance of this structure is created in a special ELF section at every -+ * code location being tagged. At runtime, the special section is treated as -+ * an array of these. 
-+ */ -+struct codetag { -+ unsigned int flags; /* used in later patches */ -+ unsigned int lineno; -+ const char *modname; -+ const char *function; -+ const char *filename; -+} __aligned(8); -+ -+union codetag_ref { -+ struct codetag *ct; -+}; -+ -+struct codetag_range { -+ struct codetag *start; -+ struct codetag *stop; -+}; -+ -+struct codetag_module { -+ struct module *mod; -+ struct codetag_range range; -+}; -+ -+struct codetag_type_desc { -+ const char *section; -+ size_t tag_size; -+ void (*module_load)(struct codetag_type *cttype, -+ struct codetag_module *cmod); -+ bool (*module_unload)(struct codetag_type *cttype, -+ struct codetag_module *cmod); -+}; -+ -+struct codetag_iterator { -+ struct codetag_type *cttype; -+ struct codetag_module *cmod; -+ unsigned long mod_id; -+ struct codetag *ct; -+}; -+ -+#define CODE_TAG_INIT { \ -+ .modname = KBUILD_MODNAME, \ -+ .function = __func__, \ -+ .filename = __FILE__, \ -+ .lineno = __LINE__, \ -+ .flags = 0, \ -+} -+ -+void codetag_lock_module_list(struct codetag_type *cttype, bool lock); -+struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype); -+struct codetag *codetag_next_ct(struct codetag_iterator *iter); -+ -+void codetag_to_text(struct seq_buf *out, struct codetag *ct); -+ -+struct codetag_type * -+codetag_register_type(const struct codetag_type_desc *desc); -+ -+#ifdef CONFIG_CODE_TAGGING -+void codetag_load_module(struct module *mod); -+bool codetag_unload_module(struct module *mod); -+#else -+static inline void codetag_load_module(struct module *mod) {} -+static inline bool codetag_unload_module(struct module *mod) { return true; } -+#endif -+ -+/* Codetag query parsing */ -+ -+struct codetag_query { -+ const char *filename; -+ const char *module; -+ const char *function; -+ const char *class; -+ unsigned int first_line, last_line; -+ unsigned int first_index, last_index; -+ unsigned int cur_index; -+ -+ bool match_line:1; -+ bool match_index:1; -+ -+ unsigned int set_enabled:1; -+ unsigned int enabled:2; -+ -+ unsigned int set_frequency:1; -+ unsigned int frequency; -+}; -+ -+char *codetag_query_parse(struct codetag_query *q, char *buf); -+bool codetag_matches_query(struct codetag_query *q, -+ const struct codetag *ct, -+ const struct codetag_module *mod, -+ const char *class); -+ -+#endif /* _LINUX_CODETAG_H */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6b351e009..3da2f0545 100644 --- a/include/linux/dcache.h @@ -94695,106 +96588,8 @@ index 6b351e009..3da2f0545 100644 extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); -diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h -index 31f114f48..d741940dc 100644 ---- a/include/linux/dma-map-ops.h -+++ b/include/linux/dma-map-ops.h -@@ -27,7 +27,7 @@ struct dma_map_ops { - unsigned long attrs); - void (*free)(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs); -- struct page *(*alloc_pages)(struct device *dev, size_t size, -+ struct page *(*alloc_pages_op)(struct device *dev, size_t size, - dma_addr_t *dma_handle, enum dma_data_direction dir, - gfp_t gfp); - void (*free_pages)(struct device *dev, size_t size, struct page *vaddr, -diff --git a/include/linux/dynamic_fault.h b/include/linux/dynamic_fault.h -new file mode 100644 -index 000000000..526a33209 ---- /dev/null -+++ b/include/linux/dynamic_fault.h -@@ -0,0 +1,79 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+ -+#ifndef _LINUX_DYNAMIC_FAULT_H -+#define _LINUX_DYNAMIC_FAULT_H 
-+ -+/* -+ * Dynamic/code tagging fault injection: -+ * -+ * Originally based on the dynamic debug trick of putting types in a special elf -+ * section, then rewritten using code tagging: -+ * -+ * To use, simply insert a call to dynamic_fault("fault_class"), which will -+ * return true if an error should be injected. -+ * -+ * Fault injection sites may be listed and enabled via debugfs, under -+ * /sys/kernel/debug/dynamic_faults. -+ */ -+ -+#ifdef CONFIG_CODETAG_FAULT_INJECTION -+ -+#include -+#include -+ -+#define DFAULT_STATES() \ -+ x(disabled) \ -+ x(enabled) \ -+ x(oneshot) -+ -+enum dfault_enabled { -+#define x(n) DFAULT_##n, -+ DFAULT_STATES() -+#undef x -+}; -+ -+union dfault_state { -+ struct { -+ unsigned int enabled:2; -+ unsigned int count:30; -+ }; -+ -+ struct { -+ unsigned int v; -+ }; -+}; -+ -+struct dfault { -+ struct codetag tag; -+ const char *class; -+ unsigned int frequency; -+ union dfault_state state; -+ struct static_key_false enabled; -+}; -+ -+bool __dynamic_fault_enabled(struct dfault *df); -+ -+#define dynamic_fault(_class) \ -+({ \ -+ static struct dfault \ -+ __used \ -+ __section("dynamic_fault_tags") \ -+ __aligned(8) df = { \ -+ .tag = CODE_TAG_INIT, \ -+ .class = _class, \ -+ .enabled = STATIC_KEY_FALSE_INIT, \ -+ }; \ -+ \ -+ static_key_false(&df.enabled.key) && \ -+ __dynamic_fault_enabled(&df); \ -+}) -+ -+#else -+ -+#define dynamic_fault(_class) false -+ -+#endif /* CODETAG_FAULT_INJECTION */ -+ -+#define memory_fault() dynamic_fault("memory") -+ -+#endif /* _LINUX_DYNAMIC_FAULT_H */ diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h -index 9edb29101..4bf7c8466 100644 +index 11fbd0ee1..f49a7d311 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -98,6 +98,12 @@ enum fid_type { @@ -94810,35 +96605,11 @@ index 9edb29101..4bf7c8466 100644 /* * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) -diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h -index c9de1f59e..6f36fff09 100644 ---- a/include/linux/fortify-string.h -+++ b/include/linux/fortify-string.h -@@ -689,9 +689,9 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) - return __real_memchr_inv(p, c, size); - } - --extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup) -+extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup_noprof) - __realloc_size(2); --__FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp) -+__FORTIFY_INLINE void *kmemdup_noprof(const void * const POS0 p, size_t size, gfp_t gfp) - { - size_t p_size = __struct_size(p); - -@@ -701,6 +701,7 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp - fortify_panic(__func__); - return __real_kmemdup(p, size, gfp); - } -+#define kmemdup(...) 
alloc_hooks(kmemdup_noprof(__VA_ARGS__)) - - /** - * strcpy - Copy a string into another string buffer diff --git a/include/linux/fs.h b/include/linux/fs.h -index 133f0640f..f04872975 100644 +index 562f2623c..810fa0812 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h -@@ -664,7 +664,8 @@ struct inode { +@@ -660,7 +660,8 @@ struct inode { unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; @@ -94848,7 +96619,7 @@ index 133f0640f..f04872975 100644 struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ -@@ -730,7 +731,7 @@ static inline unsigned int i_blocksize(const struct inode *node) +@@ -726,7 +727,7 @@ static inline unsigned int i_blocksize(const struct inode *node) static inline int inode_unhashed(struct inode *inode) { @@ -94857,7 +96628,7 @@ index 133f0640f..f04872975 100644 } /* -@@ -741,7 +742,7 @@ static inline int inode_unhashed(struct inode *inode) +@@ -737,7 +738,7 @@ static inline int inode_unhashed(struct inode *inode) */ static inline void inode_fake_hash(struct inode *inode) { @@ -94866,7 +96637,7 @@ index 133f0640f..f04872975 100644 } /* -@@ -2699,11 +2700,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, +@@ -2729,11 +2730,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, * This must be used for allocating filesystems specific inodes to set * up the inode reclaim context correctly. */ @@ -94879,7 +96650,7 @@ index 133f0640f..f04872975 100644 extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) -@@ -2714,7 +2711,7 @@ static inline void insert_inode_hash(struct inode *inode) +@@ -2744,7 +2741,7 @@ static inline void insert_inode_hash(struct inode *inode) extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { @@ -94888,14 +96659,6 @@ index 133f0640f..f04872975 100644 __remove_inode_hash(inode); } -@@ -2897,6 +2894,7 @@ extern struct file_system_type *get_filesystem(struct file_system_type *fs); - extern void put_filesystem(struct file_system_type *fs); - extern struct file_system_type *get_fs_type(const char *name); - extern struct super_block *get_super(struct block_device *); -+extern struct super_block *try_get_super(struct block_device *); - extern struct super_block *get_active_super(struct block_device *bdev); - extern void drop_super(struct super_block *sb); - extern void drop_super_exclusive(struct super_block *sb); diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 107613f7d..c74b73769 100644 --- a/include/linux/generic-radix-tree.h @@ -95007,220 +96770,11 @@ index 107613f7d..c74b73769 100644 int __genradix_prealloc(struct __genradix *, size_t, gfp_t); /** -diff --git a/include/linux/gfp.h b/include/linux/gfp.h -index ed8cb537c..495745c99 100644 ---- a/include/linux/gfp.h -+++ b/include/linux/gfp.h -@@ -6,6 +6,8 @@ - - #include - #include -+#include -+#include - - struct vm_area_struct; - -@@ -174,42 +176,43 @@ static inline void arch_free_page(struct page *page, int order) { } - static inline void arch_alloc_page(struct page *page, int order) { } - #endif - --struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, -+struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid, - nodemask_t *nodemask); --struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, -+#define 
__alloc_pages(...) alloc_hooks(__alloc_pages_noprof(__VA_ARGS__)) -+ -+struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, - nodemask_t *nodemask); -+#define __folio_alloc(...) alloc_hooks(__folio_alloc_noprof(__VA_ARGS__)) - --unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, -+unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, - nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, - struct page **page_array); -+#define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) - --unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, -+unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, - unsigned long nr_pages, - struct page **page_array); -+#define alloc_pages_bulk_array_mempolicy(...) alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__)) - - /* Bulk allocate order-0 pages */ --static inline unsigned long --alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list) --{ -- return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL); --} -+#define alloc_pages_bulk_list(_gfp, _nr_pages, _list) \ -+ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL) - --static inline unsigned long --alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array) --{ -- return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array); --} -+#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \ -+ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array) - - static inline unsigned long --alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) -+alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) - { - if (nid == NUMA_NO_NODE) - nid = numa_mem_id(); - -- return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array); -+ return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, NULL, page_array); - } - -+#define alloc_pages_bulk_array_node(...) alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__)) -+ - static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) - { - gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN); -@@ -229,21 +232,23 @@ static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) - * online. For more general interface, see alloc_pages_node(). - */ - static inline struct page * --__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) -+__alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) - { - VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); - warn_if_node_offline(nid, gfp_mask); - -- return __alloc_pages(gfp_mask, order, nid, NULL); -+ return __alloc_pages_noprof(gfp_mask, order, nid, NULL); - } - -+#define __alloc_pages_node(...) alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__)) -+ - static inline - struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) - { - VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); - warn_if_node_offline(nid, gfp); - -- return __folio_alloc(gfp, order, nid, NULL); -+ return __folio_alloc_noprof(gfp, order, nid, NULL); - } - - /* -@@ -251,53 +256,69 @@ struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) - * prefer the current CPU's closest node. Otherwise node must be valid and - * online. 
- */ --static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, -- unsigned int order) -+static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, -+ unsigned int order) - { - if (nid == NUMA_NO_NODE) - nid = numa_mem_id(); - -- return __alloc_pages_node(nid, gfp_mask, order); -+ return __alloc_pages_node_noprof(nid, gfp_mask, order); - } - -+#define alloc_pages_node(...) alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__)) -+ - #ifdef CONFIG_NUMA --struct page *alloc_pages(gfp_t gfp, unsigned int order); --struct folio *folio_alloc(gfp_t gfp, unsigned order); --struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, -+struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); -+struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); -+struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage); - #else --static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) -+static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) - { -- return alloc_pages_node(numa_node_id(), gfp_mask, order); -+ return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); - } --static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) -+static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) - { - return __folio_alloc_node(gfp, order, numa_node_id()); - } --#define vma_alloc_folio(gfp, order, vma, addr, hugepage) \ -- folio_alloc(gfp, order) -+#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \ -+ folio_alloc_noprof(gfp, order) - #endif -+ -+#define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) -+#define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) -+#define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) -+ - #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) --static inline struct page *alloc_page_vma(gfp_t gfp, -+ -+static inline struct page *alloc_page_vma_noprof(gfp_t gfp, - struct vm_area_struct *vma, unsigned long addr) - { -- struct folio *folio = vma_alloc_folio(gfp, 0, vma, addr, false); -+ struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr, false); - - return &folio->page; - } -+#define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) -+ -+extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); -+#define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) - --extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); --extern unsigned long get_zeroed_page(gfp_t gfp_mask); -+extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask); -+#define get_zeroed_page(...) alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__)) -+ -+void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1); -+#define alloc_pages_exact(...) alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__)) - --void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); - void free_pages_exact(void *virt, size_t size); --__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); - --#define __get_free_page(gfp_mask) \ -- __get_free_pages((gfp_mask), 0) -+__meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); -+#define alloc_pages_exact_nid(...) 
alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__)) -+ -+#define __get_free_page(gfp_mask) \ -+ __get_free_pages((gfp_mask), 0) - --#define __get_dma_pages(gfp_mask, order) \ -- __get_free_pages((gfp_mask) | GFP_DMA, (order)) -+#define __get_dma_pages(gfp_mask, order) \ -+ __get_free_pages((gfp_mask) | GFP_DMA, (order)) - - extern void __free_pages(struct page *page, unsigned int order); - extern void free_pages(unsigned long addr, unsigned int order); -@@ -354,10 +375,14 @@ static inline bool pm_suspended_storage(void) - - #ifdef CONFIG_CONTIG_ALLOC - /* The below functions must be run on a range from a single zone. */ --extern int alloc_contig_range(unsigned long start, unsigned long end, -+extern int alloc_contig_range_noprof(unsigned long start, unsigned long end, - unsigned migratetype, gfp_t gfp_mask); --extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, -- int nid, nodemask_t *nodemask); -+#define alloc_contig_range(...) alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) -+ -+extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, -+ int nid, nodemask_t *nodemask); -+#define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) -+ - #endif - void free_contig_range(unsigned long pfn, unsigned long nr_pages); - diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h -index 6583a5867..1c6573d69 100644 +index 6583a5867..3fbe62476 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h -@@ -21,44 +21,86 @@ typedef unsigned int __bitwise gfp_t; +@@ -21,44 +21,78 @@ typedef unsigned int __bitwise gfp_t; * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c */ @@ -95256,9 +96810,6 @@ index 6583a5867..1c6573d69 100644 +#ifdef CONFIG_LOCKDEP + ___GFP_NOLOCKDEP_BIT, +#endif -+#ifdef CONFIG_SLAB_OBJ_EXT -+ ___GFP_NO_OBJ_EXT_BIT, -+#endif + ___GFP_LAST_BIT +}; + @@ -95326,31 +96877,10 @@ index 6583a5867..1c6573d69 100644 #define ___GFP_NOLOCKDEP 0 #endif -/* If the above are modified, __GFP_BITS_SHIFT may need updating */ -+#ifdef CONFIG_SLAB_OBJ_EXT -+#define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT) -+#else -+#define ___GFP_NO_OBJ_EXT 0 -+#endif /* * Physical address zone modifiers (see linux/mmzone.h - low four bits) -@@ -99,12 +141,15 @@ typedef unsigned int __bitwise gfp_t; - * node with no fallbacks or placement policy enforcements. - * - * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg. -+ * -+ * %__GFP_NO_OBJ_EXT causes slab allocation to have no object extension. 
- */ - #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) - #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) - #define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) - #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE) - #define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT) -+#define __GFP_NO_OBJ_EXT ((__force gfp_t)___GFP_NO_OBJ_EXT) - - /** - * DOC: Watermark modifiers -@@ -249,7 +294,7 @@ typedef unsigned int __bitwise gfp_t; +@@ -249,7 +283,7 @@ typedef unsigned int __bitwise gfp_t; #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ @@ -95359,19 +96889,6 @@ index 6583a5867..1c6573d69 100644 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** -diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h -index 0ee140176..e67349e84 100644 ---- a/include/linux/hrtimer.h -+++ b/include/linux/hrtimer.h -@@ -16,7 +16,7 @@ - #include - #include - #include --#include -+#include - #include - #include - #include diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e2b836c2e..a774d074b 100644 --- a/include/linux/iomap.h @@ -95418,7 +96935,7 @@ index ae1b54144..8ee2bf5af 100644 { bit_spin_lock(0, (unsigned long *)b); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h -index 74bd269a8..3bb30499d 100644 +index 310f85903..2fdfd9129 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -344,6 +344,8 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); @@ -95439,7 +96956,7 @@ index 74bd269a8..3bb30499d 100644 #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map __maybe_unused _name = {} -@@ -681,4 +685,10 @@ lockdep_rcu_suspicious(const char *file, const int line, const char *s) +@@ -689,4 +693,10 @@ lockdep_rcu_suspicious(const char *file, const int line, const char *s) } #endif @@ -95451,10 +96968,10 @@ index 74bd269a8..3bb30499d 100644 + #endif /* __LINUX_LOCKDEP_H */ diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h -index 59f4fb162..f90c779e4 100644 +index 2ebc323d3..aa6bddac2 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h -@@ -129,7 +129,7 @@ struct lock_class { +@@ -137,7 +137,7 @@ struct lock_class { u8 wait_type_inner; u8 wait_type_outer; u8 lock_type; @@ -95667,306 +97184,8 @@ index 000000000..647505010 +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); + +#endif // MEAN_AND_VAIRANCE_H_ -diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h -index 222d73701..3eb8975c1 100644 ---- a/include/linux/memcontrol.h -+++ b/include/linux/memcontrol.h -@@ -339,15 +339,32 @@ struct mem_cgroup { - extern struct mem_cgroup *root_mem_cgroup; - - enum page_memcg_data_flags { -- /* page->memcg_data is a pointer to an objcgs vector */ -- MEMCG_DATA_OBJCGS = (1UL << 0), -+ /* page->memcg_data is a pointer to an slabobj_ext vector */ -+ MEMCG_DATA_OBJEXTS = (1UL << 0), - /* page has been accounted as a non-slab kernel page */ - MEMCG_DATA_KMEM = (1UL << 1), - /* the next bit after the last actual flag */ - __NR_MEMCG_DATA_FLAGS = (1UL << 2), - }; - --#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) -+#define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS -+ -+#else /* CONFIG_MEMCG */ -+ -+#define __FIRST_OBJEXT_FLAG (1UL << 0) -+ -+#endif /* CONFIG_MEMCG */ -+ -+enum objext_flags { -+ /* slabobj_ext vector failed to allocate */ -+ OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG, -+ /* the next bit after the last actual flag */ -+ __NR_OBJEXTS_FLAGS = 
(__FIRST_OBJEXT_FLAG << 1), -+}; -+ -+#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1) -+ -+#ifdef CONFIG_MEMCG - - static inline bool folio_memcg_kmem(struct folio *folio); - -@@ -378,10 +395,10 @@ static inline struct mem_cgroup *__folio_memcg(struct folio *folio) - unsigned long memcg_data = folio->memcg_data; - - VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); -- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); -+ VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); - -- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+ return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - } - - /* -@@ -399,10 +416,10 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) - unsigned long memcg_data = folio->memcg_data; - - VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); -- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); -+ VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); - VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); - -- return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+ return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - } - - /* -@@ -459,11 +476,11 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) - if (memcg_data & MEMCG_DATA_KMEM) { - struct obj_cgroup *objcg; - -- objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+ objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - return obj_cgroup_memcg(objcg); - } - -- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+ return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - } - - /* -@@ -496,17 +513,17 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) - */ - unsigned long memcg_data = READ_ONCE(folio->memcg_data); - -- if (memcg_data & MEMCG_DATA_OBJCGS) -+ if (memcg_data & MEMCG_DATA_OBJEXTS) - return NULL; - - if (memcg_data & MEMCG_DATA_KMEM) { - struct obj_cgroup *objcg; - -- objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+ objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - return obj_cgroup_memcg(objcg); - } - -- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+ return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - } - - static inline struct mem_cgroup *page_memcg_check(struct page *page) -@@ -542,7 +559,7 @@ static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *ob - static inline bool folio_memcg_kmem(struct folio *folio) - { - VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page); -- VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio); -+ VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio); - return folio->memcg_data & MEMCG_DATA_KMEM; - } - -@@ -1606,6 +1623,19 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, - } - #endif /* CONFIG_MEMCG */ - -+/* -+ * Extended information for slab objects stored as an array in page->memcg_data -+ * if MEMCG_DATA_OBJEXTS is set. 
-+ */ -+struct slabobj_ext { -+#ifdef CONFIG_MEMCG_KMEM -+ struct obj_cgroup *objcg; -+#endif -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ union codetag_ref ref; -+#endif -+} __aligned(8); -+ - static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) - { - __mod_lruvec_kmem_state(p, idx, 1); -diff --git a/include/linux/mempool.h b/include/linux/mempool.h -index 4aae6c06c..9fa126aa1 100644 ---- a/include/linux/mempool.h -+++ b/include/linux/mempool.h -@@ -5,6 +5,8 @@ - #ifndef _LINUX_MEMPOOL_H - #define _LINUX_MEMPOOL_H - -+#include -+#include - #include - #include - -@@ -39,18 +41,32 @@ void mempool_exit(mempool_t *pool); - int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id); --int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, -+ -+int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); -+#define mempool_init(...) \ -+ alloc_hooks(mempool_init_noprof(__VA_ARGS__)) - - extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); --extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, -+ -+extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int nid); -+#define mempool_create_node(...) \ -+ alloc_hooks(mempool_create_node_noprof(__VA_ARGS__)) -+ -+#define mempool_create(_min_nr, _alloc_fn, _free_fn, _pool_data) \ -+ mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data, \ -+ GFP_KERNEL, NUMA_NO_NODE) - - extern int mempool_resize(mempool_t *pool, int new_min_nr); - extern void mempool_destroy(mempool_t *pool); --extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc; -+ -+extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc; -+#define mempool_alloc(...) 
\ -+ alloc_hooks(mempool_alloc_noprof(__VA_ARGS__)) -+ - extern void mempool_free(void *element, mempool_t *pool); - - /* -@@ -61,19 +77,10 @@ extern void mempool_free(void *element, mempool_t *pool); - void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data); - void mempool_free_slab(void *element, void *pool_data); - --static inline int --mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc) --{ -- return mempool_init(pool, min_nr, mempool_alloc_slab, -- mempool_free_slab, (void *) kc); --} -- --static inline mempool_t * --mempool_create_slab_pool(int min_nr, struct kmem_cache *kc) --{ -- return mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab, -- (void *) kc); --} -+#define mempool_init_slab_pool(_pool, _min_nr, _kc) \ -+ mempool_init(_pool, (_min_nr), mempool_alloc_slab, mempool_free_slab, (void *)(_kc)) -+#define mempool_create_slab_pool(_min_nr, _kc) \ -+ mempool_create((_min_nr), mempool_alloc_slab, mempool_free_slab, (void *)(_kc)) - - /* - * a mempool_alloc_t and a mempool_free_t to kmalloc and kfree the -@@ -82,17 +89,12 @@ mempool_create_slab_pool(int min_nr, struct kmem_cache *kc) - void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data); - void mempool_kfree(void *element, void *pool_data); - --static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size) --{ -- return mempool_init(pool, min_nr, mempool_kmalloc, -- mempool_kfree, (void *) size); --} -- --static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) --{ -- return mempool_create(min_nr, mempool_kmalloc, mempool_kfree, -- (void *) size); --} -+#define mempool_init_kmalloc_pool(_pool, _min_nr, _size) \ -+ mempool_init(_pool, (_min_nr), mempool_kmalloc, mempool_kfree, \ -+ (void *)(unsigned long)(_size)) -+#define mempool_create_kmalloc_pool(_min_nr, _size) \ -+ mempool_create((_min_nr), mempool_kmalloc, mempool_kfree, \ -+ (void *)(unsigned long)(_size)) - - /* - * A mempool_alloc_t and mempool_free_t for a simple page allocator that -@@ -101,16 +103,11 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) - void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data); - void mempool_free_pages(void *element, void *pool_data); - --static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order) --{ -- return mempool_init(pool, min_nr, mempool_alloc_pages, -- mempool_free_pages, (void *)(long)order); --} -- --static inline mempool_t *mempool_create_page_pool(int min_nr, int order) --{ -- return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages, -- (void *)(long)order); --} -+#define mempool_init_page_pool(_pool, _min_nr, _order) \ -+ mempool_init(_pool, (_min_nr), mempool_alloc_pages, \ -+ mempool_free_pages, (void *)(long)(_order)) -+#define mempool_create_page_pool(_min_nr, _order) \ -+ mempool_create((_min_nr), mempool_alloc_pages, \ -+ mempool_free_pages, (void *)(long)(_order)) - - #endif /* _LINUX_MEMPOOL_H */ -diff --git a/include/linux/mm.h b/include/linux/mm.h -index 3c6c4c836..88b45fb4f 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -5,6 +5,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -2925,6 +2926,13 @@ extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); - /* Free the reserved page into the buddy system, so it gets managed. 
*/ - static inline void free_reserved_page(struct page *page) - { -+ union codetag_ref *ref; -+ -+ ref = get_page_tag_ref(page); -+ if (ref) { -+ set_codetag_empty(ref); -+ put_page_tag_ref(ref); -+ } - ClearPageReserved(page); - init_page_count(page); - __free_page(page); -diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index de10fc797..888b87b3c 100644 ---- a/include/linux/mm_types.h -+++ b/include/linux/mm_types.h -@@ -194,7 +194,7 @@ struct page { - /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ - atomic_t _refcount; - --#ifdef CONFIG_MEMCG -+#ifdef CONFIG_SLAB_OBJ_EXT - unsigned long memcg_data; - #endif - -@@ -320,7 +320,7 @@ struct folio { - void *private; - atomic_t _mapcount; - atomic_t _refcount; --#ifdef CONFIG_MEMCG -+#ifdef CONFIG_SLAB_OBJ_EXT - unsigned long memcg_data; - #endif - /* private: the union with struct page is transitional */ diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h -index bb0ee8052..fda37b6df 100644 +index 8d07116ca..b61438313 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -93,10 +93,10 @@ @@ -95996,211 +97215,6 @@ index 000000000..84c2f47c4 +typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; + +#endif /* __LINUX_NODEMASK_TYPES_H */ -diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h -index 67314f648..cff15ee54 100644 ---- a/include/linux/page_ext.h -+++ b/include/linux/page_ext.h -@@ -4,7 +4,6 @@ - - #include - #include --#include - - struct pglist_data; - -diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index 08328b579..347ba7f86 100644 ---- a/include/linux/pagemap.h -+++ b/include/linux/pagemap.h -@@ -467,14 +467,17 @@ static inline void *detach_page_private(struct page *page) - } - - #ifdef CONFIG_NUMA --struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order); -+struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); - #else --static inline struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) -+static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) - { -- return folio_alloc(gfp, order); -+ return folio_alloc_noprof(gfp, order); - } - #endif - -+#define filemap_alloc_folio(...) 
\ -+ alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__)) -+ - static inline struct page *__page_cache_alloc(gfp_t gfp) - { - return &filemap_alloc_folio(gfp, 0)->page; -diff --git a/include/linux/percpu.h b/include/linux/percpu.h -index 1338ea2aa..dc50dedb0 100644 ---- a/include/linux/percpu.h -+++ b/include/linux/percpu.h -@@ -2,12 +2,14 @@ - #ifndef __LINUX_PERCPU_H - #define __LINUX_PERCPU_H - -+#include - #include - #include - #include - #include - #include - #include -+#include - - #include - -@@ -116,7 +118,6 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); - #endif - --extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1); - extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); - extern bool is_kernel_percpu_address(unsigned long addr); - -@@ -124,10 +125,15 @@ extern bool is_kernel_percpu_address(unsigned long addr); - extern void __init setup_per_cpu_areas(void); - #endif - --extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1); --extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1); --extern void free_percpu(void __percpu *__pdata); --extern phys_addr_t per_cpu_ptr_to_phys(void *addr); -+extern void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, -+ gfp_t gfp) __alloc_size(1); -+ -+#define __alloc_percpu_gfp(_size, _align, _gfp) \ -+ alloc_hooks(pcpu_alloc_noprof(_size, _align, false, _gfp)) -+#define __alloc_percpu(_size, _align) \ -+ alloc_hooks(pcpu_alloc_noprof(_size, _align, false, GFP_KERNEL)) -+#define __alloc_reserved_percpu(_size, _align) \ -+ alloc_hooks(pcpu_alloc_noprof(_size, _align, true, GFP_KERNEL)) - - #define alloc_percpu_gfp(type, gfp) \ - (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \ -@@ -136,6 +142,9 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); - (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ - __alignof__(type)) - -+extern void free_percpu(void __percpu *__pdata); -+extern phys_addr_t per_cpu_ptr_to_phys(void *addr); -+ - extern unsigned long pcpu_nr_pages(void); - - #endif /* __LINUX_PERCPU_H */ -diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h -new file mode 100644 -index 000000000..ae9b0f359 ---- /dev/null -+++ b/include/linux/pgalloc_tag.h -@@ -0,0 +1,105 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * page allocation tagging -+ */ -+#ifndef _LINUX_PGALLOC_TAG_H -+#define _LINUX_PGALLOC_TAG_H -+ -+#include -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ -+#include -+ -+extern struct page_ext_operations page_alloc_tagging_ops; -+extern struct page_ext *page_ext_get(struct page *page); -+extern void page_ext_put(struct page_ext *page_ext); -+ -+static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext) -+{ -+ return (void *)page_ext + page_alloc_tagging_ops.offset; -+} -+ -+static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref) -+{ -+ return (void *)ref - page_alloc_tagging_ops.offset; -+} -+ -+static inline union codetag_ref *get_page_tag_ref(struct page *page) -+{ -+ if (page && mem_alloc_profiling_enabled()) { -+ struct page_ext *page_ext = page_ext_get(page); -+ -+ if (page_ext) -+ return codetag_ref_from_page_ext(page_ext); -+ } -+ return NULL; -+} -+ -+static inline void put_page_tag_ref(union codetag_ref *ref) -+{ -+ page_ext_put(page_ext_from_codetag_ref(ref)); -+} -+ -+static inline void pgalloc_tag_add(struct page *page, struct 
task_struct *task, -+ unsigned int order) -+{ -+ union codetag_ref *ref = get_page_tag_ref(page); -+ -+ if (ref) { -+ alloc_tag_add(ref, task->alloc_tag, PAGE_SIZE << order); -+ put_page_tag_ref(ref); -+ } -+} -+ -+static inline void pgalloc_tag_sub(struct page *page, unsigned int order) -+{ -+ union codetag_ref *ref = get_page_tag_ref(page); -+ -+ if (ref) { -+ alloc_tag_sub(ref, PAGE_SIZE << order); -+ put_page_tag_ref(ref); -+ } -+} -+ -+static inline void pgalloc_tag_split(struct page *page, unsigned int nr) -+{ -+ int i; -+ struct page_ext *page_ext; -+ union codetag_ref *ref; -+ struct alloc_tag *tag; -+ -+ if (!mem_alloc_profiling_enabled()) -+ return; -+ -+ page_ext = page_ext_get(page); -+ if (unlikely(!page_ext)) -+ return; -+ -+ ref = codetag_ref_from_page_ext(page_ext); -+ if (!ref->ct) -+ goto out; -+ -+ tag = ct_to_alloc_tag(ref->ct); -+ page_ext = page_ext_next(page_ext); -+ for (i = 1; i < nr; i++) { -+ /* New reference with 0 bytes accounted */ -+ alloc_tag_add(codetag_ref_from_page_ext(page_ext), tag, 0); -+ page_ext = page_ext_next(page_ext); -+ } -+out: -+ page_ext_put(page_ext); -+} -+ -+#else /* CONFIG_MEM_ALLOC_PROFILING */ -+ -+static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; } -+static inline void put_page_tag_ref(union codetag_ref *ref) {} -+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, -+ unsigned int order) {} -+static inline void pgalloc_tag_sub(struct page *page, unsigned int order) {} -+static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {} -+ -+#endif /* CONFIG_MEM_ALLOC_PROFILING */ -+ -+#endif /* _LINUX_PGALLOC_TAG_H */ diff --git a/include/linux/prandom.h b/include/linux/prandom.h index f2ed5b72b..f7f1e5251 100644 --- a/include/linux/prandom.h @@ -96213,43 +97227,8 @@ index f2ed5b72b..f7f1e5251 100644 #include struct rnd_state { -diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h -index 57467cbf4..92a8e670c 100644 ---- a/include/linux/rhashtable-types.h -+++ b/include/linux/rhashtable-types.h -@@ -9,6 +9,7 @@ - #ifndef _LINUX_RHASHTABLE_TYPES_H - #define _LINUX_RHASHTABLE_TYPES_H - -+#include - #include - #include - #include -@@ -88,6 +89,7 @@ struct rhashtable { - struct mutex mutex; - spinlock_t lock; - atomic_t nelems; -+ struct alloc_tag *alloc_tag; - }; - - /** -@@ -127,9 +129,12 @@ struct rhashtable_iter { - bool end_of_table; - }; - --int rhashtable_init(struct rhashtable *ht, -+int rhashtable_init_noprof(struct rhashtable *ht, - const struct rhashtable_params *params); --int rhltable_init(struct rhltable *hlt, -+#define rhashtable_init(...) alloc_hooks(rhashtable_init_noprof(__VA_ARGS__)) -+ -+int rhltable_init_noprof(struct rhltable *hlt, - const struct rhashtable_params *params); -+#define rhltable_init(...) 
alloc_hooks(rhltable_init_noprof(__VA_ARGS__)) - - #endif /* _LINUX_RHASHTABLE_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h -index 847332470..5c359b8b2 100644 +index 609bde814..a82f63541 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -20,7 +20,7 @@ @@ -96261,26 +97240,7 @@ index 847332470..5c359b8b2 100644 #include #include #include -@@ -763,6 +763,10 @@ struct task_struct { - unsigned int flags; - unsigned int ptrace; - -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ struct alloc_tag *alloc_tag; -+#endif -+ - #ifdef CONFIG_SMP - int on_cpu; - struct __call_single_node wake_entry; -@@ -802,6 +806,7 @@ struct task_struct { - struct task_group *sched_task_group; - #endif - -+ - #ifdef CONFIG_UCLAMP_TASK - /* - * Clamp values requested for a scheduling entity. -@@ -871,6 +876,7 @@ struct task_struct { +@@ -870,6 +870,7 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; @@ -96288,7 +97248,7 @@ index 847332470..5c359b8b2 100644 int exit_state; int exit_code; -@@ -1163,7 +1169,7 @@ struct task_struct { +@@ -1162,7 +1163,7 @@ struct task_struct { #endif #ifdef CONFIG_LOCKDEP @@ -96297,30 +97257,6 @@ index 847332470..5c359b8b2 100644 u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; -@@ -2446,4 +2452,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } - - extern void sched_set_stop_task(int cpu, struct task_struct *stop); - -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) -+{ -+ swap(current->alloc_tag, tag); -+ return tag; -+} -+ -+static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) -+{ -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+ WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); -+#endif -+ current->alloc_tag = old; -+} -+#else -+static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; } -+#define alloc_tag_restore(_tag, _old) -+#endif -+ - #endif diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 515d7fcb9..cc02410f2 100644 --- a/include/linux/seq_buf.h @@ -96381,798 +97317,11 @@ index 224293b2d..a15a45d06 100644 #ifdef CONFIG_SHRINKER_DEBUG extern int shrinker_debugfs_add(struct shrinker *shrinker); -diff --git a/include/linux/six.h b/include/linux/six.h -new file mode 100644 -index 000000000..394da423c ---- /dev/null -+++ b/include/linux/six.h -@@ -0,0 +1,388 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+ -+#ifndef _LINUX_SIX_H -+#define _LINUX_SIX_H -+ -+/** -+ * DOC: SIX locks overview -+ * -+ * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores -+ * but with an additional state: read/shared, intent, exclusive/write -+ * -+ * The purpose of the intent state is to allow for greater concurrency on tree -+ * structures without deadlocking. In general, a read can't be upgraded to a -+ * write lock without deadlocking, so an operation that updates multiple nodes -+ * will have to take write locks for the full duration of the operation. -+ * -+ * But by adding an intent state, which is exclusive with other intent locks but -+ * not with readers, we can take intent locks at thte start of the operation, -+ * and then take write locks only for the actual update to each individual -+ * nodes, without deadlocking. 
-+ * -+ * Example usage: -+ * six_lock_read(&foo->lock); -+ * six_unlock_read(&foo->lock); -+ * -+ * An intent lock must be held before taking a write lock: -+ * six_lock_intent(&foo->lock); -+ * six_lock_write(&foo->lock); -+ * six_unlock_write(&foo->lock); -+ * six_unlock_intent(&foo->lock); -+ * -+ * Other operations: -+ * six_trylock_read() -+ * six_trylock_intent() -+ * six_trylock_write() -+ * -+ * six_lock_downgrade() convert from intent to read -+ * six_lock_tryupgrade() attempt to convert from read to intent, may fail -+ * -+ * There are also interfaces that take the lock type as an enum: -+ * -+ * six_lock_type(&foo->lock, SIX_LOCK_read); -+ * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent) -+ * six_lock_type(&foo->lock, SIX_LOCK_write); -+ * six_unlock_type(&foo->lock, SIX_LOCK_write); -+ * six_unlock_type(&foo->lock, SIX_LOCK_intent); -+ * -+ * Lock sequence numbers - unlock(), relock(): -+ * -+ * Locks embed sequences numbers, which are incremented on write lock/unlock. -+ * This allows locks to be dropped and the retaken iff the state they protect -+ * hasn't changed; this makes it much easier to avoid holding locks while e.g. -+ * doing IO or allocating memory. -+ * -+ * Example usage: -+ * six_lock_read(&foo->lock); -+ * u32 seq = six_lock_seq(&foo->lock); -+ * six_unlock_read(&foo->lock); -+ * -+ * some_operation_that_may_block(); -+ * -+ * if (six_relock_read(&foo->lock, seq)) { ... } -+ * -+ * If the relock operation succeeds, it is as if the lock was never unlocked. -+ * -+ * Reentrancy: -+ * -+ * Six locks are not by themselves reentrent, but have counters for both the -+ * read and intent states that can be used to provide reentrency by an upper -+ * layer that tracks held locks. If a lock is known to already be held in the -+ * read or intent state, six_lock_increment() can be used to bump the "lock -+ * held in this state" counter, increasing the number of unlock calls that -+ * will be required to fully unlock it. -+ * -+ * Example usage: -+ * six_lock_read(&foo->lock); -+ * six_lock_increment(&foo->lock, SIX_LOCK_read); -+ * six_unlock_read(&foo->lock); -+ * six_unlock_read(&foo->lock); -+ * foo->lock is now fully unlocked. -+ * -+ * Since the intent state supercedes read, it's legal to increment the read -+ * counter when holding an intent lock, but not the reverse. -+ * -+ * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) -+ * is not legal. -+ * -+ * should_sleep_fn: -+ * -+ * There is a six_lock() variant that takes a function pointer that is called -+ * immediately prior to schedule() when blocking, and may return an error to -+ * abort. -+ * -+ * One possible use for this feature is when objects being locked are part of -+ * a cache and may reused, and lock ordering is based on a property of the -+ * object that will change when the object is reused - i.e. logical key order. -+ * -+ * If looking up an object in the cache may race with object reuse, and lock -+ * ordering is required to prevent deadlock, object reuse may change the -+ * correct lock order for that object and cause a deadlock. should_sleep_fn -+ * can be used to check if the object is still the object we want and avoid -+ * this deadlock. -+ * -+ * Wait list entry interface: -+ * -+ * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a -+ * wait list entry. 
By embedding six_lock_waiter into another object, and by -+ * traversing lock waitlists, it is then possible for an upper layer to -+ * implement full cycle detection for deadlock avoidance. -+ * -+ * should_sleep_fn should be used for invoking the cycle detector, walking the -+ * graph of held locks to check for a deadlock. The upper layer must track -+ * held locks for each thread, and each thread's held locks must be reachable -+ * from its six_lock_waiter object. -+ * -+ * six_lock_waiter() will add the wait object to the waitlist re-trying taking -+ * the lock, and before calling should_sleep_fn, and the wait object will not -+ * be removed from the waitlist until either the lock has been successfully -+ * acquired, or we aborted because should_sleep_fn returned an error. -+ * -+ * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will -+ * have timestamps in strictly ascending order - this is so the timestamp can -+ * be used as a cursor for lock graph traverse. -+ */ -+ -+#include -+#include -+#include -+#include -+ -+enum six_lock_type { -+ SIX_LOCK_read, -+ SIX_LOCK_intent, -+ SIX_LOCK_write, -+}; -+ -+struct six_lock { -+ atomic_t state; -+ u32 seq; -+ unsigned intent_lock_recurse; -+ struct task_struct *owner; -+ unsigned __percpu *readers; -+ struct optimistic_spin_queue osq; -+ raw_spinlock_t wait_lock; -+ struct list_head wait_list; -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ struct lockdep_map dep_map; -+#endif -+}; -+ -+struct six_lock_waiter { -+ struct list_head list; -+ struct task_struct *task; -+ enum six_lock_type lock_want; -+ bool lock_acquired; -+ u64 start_time; -+}; -+ -+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); -+ -+void six_lock_exit(struct six_lock *lock); -+ -+enum six_lock_init_flags { -+ SIX_LOCK_INIT_PCPU = 1U << 0, -+}; -+ -+void __six_lock_init(struct six_lock *lock, const char *name, -+ struct lock_class_key *key, enum six_lock_init_flags flags); -+ -+/** -+ * six_lock_init - initialize a six lock -+ * @lock: lock to initialize -+ * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU -+ */ -+#define six_lock_init(lock, flags) \ -+do { \ -+ static struct lock_class_key __key; \ -+ \ -+ __six_lock_init((lock), #lock, &__key, flags); \ -+} while (0) -+ -+/** -+ * six_lock_seq - obtain current lock sequence number -+ * @lock: six_lock to obtain sequence number for -+ * -+ * @lock should be held for read or intent, and not write -+ * -+ * By saving the lock sequence number, we can unlock @lock and then (typically -+ * after some blocking operation) attempt to relock it: the relock will succeed -+ * if the sequence number hasn't changed, meaning no write locks have been taken -+ * and state corresponding to what @lock protects is still valid. -+ */ -+static inline u32 six_lock_seq(const struct six_lock *lock) -+{ -+ return lock->seq; -+} -+ -+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); -+ -+/** -+ * six_trylock_type - attempt to take a six lock without blocking -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * -+ * Return: true on success, false on failure. 
-+ */ -+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ return six_trylock_ip(lock, type, _THIS_IP_); -+} -+ -+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, -+ struct six_lock_waiter *wait, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip); -+ -+/** -+ * six_lock_waiter - take a lock, with full waitlist interface -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @wait: pointer to wait object, which will be added to lock's waitlist -+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior -+ * to scheduling -+ * @p: passed through to @should_sleep_fn -+ * -+ * This is a convenience wrapper around six_lock_ip_waiter(), see that function -+ * for full documentation. -+ * -+ * Return: 0 on success, or the return code from @should_sleep_fn on failure. -+ */ -+static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type, -+ struct six_lock_waiter *wait, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); -+} -+ -+/** -+ * six_lock_ip - take a six lock lock -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior -+ * to scheduling -+ * @p: passed through to @should_sleep_fn -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * Return: 0 on success, or the return code from @should_sleep_fn on failure. -+ */ -+static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip) -+{ -+ struct six_lock_waiter wait; -+ -+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); -+} -+ -+/** -+ * six_lock_type - take a six lock lock -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior -+ * to scheduling -+ * @p: passed through to @should_sleep_fn -+ * -+ * Return: 0 on success, or the return code from @should_sleep_fn on failure. -+ */ -+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ struct six_lock_waiter wait; -+ -+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); -+} -+ -+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq, unsigned long ip); -+ -+/** -+ * six_relock_type - attempt to re-take a lock that was held previously -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @seq: lock sequence number obtained from six_lock_seq() while lock was -+ * held previously -+ * -+ * Return: true on success, false on failure. -+ */ -+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq) -+{ -+ return six_relock_ip(lock, type, seq, _THIS_IP_); -+} -+ -+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); -+ -+/** -+ * six_unlock_type - drop a six lock -+ * @lock: lock to unlock -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * -+ * When a lock is held multiple times (because six_lock_incement()) was used), -+ * this decrements the 'lock held' counter by one. 
-+ * -+ * For example: -+ * six_lock_read(&foo->lock); read count 1 -+ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 -+ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 -+ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 -+ */ -+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ six_unlock_ip(lock, type, _THIS_IP_); -+} -+ -+#define __SIX_LOCK(type) \ -+static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ -+{ \ -+ return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ -+} \ -+ \ -+static inline bool six_trylock_##type(struct six_lock *lock) \ -+{ \ -+ return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ -+} \ -+ \ -+static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ -+ struct six_lock_waiter *wait, \ -+ six_lock_should_sleep_fn should_sleep_fn, void *p,\ -+ unsigned long ip) \ -+{ \ -+ return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ -+} \ -+ \ -+static inline int six_lock_ip_##type(struct six_lock *lock, \ -+ six_lock_should_sleep_fn should_sleep_fn, void *p, \ -+ unsigned long ip) \ -+{ \ -+ return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ -+} \ -+ \ -+static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ -+{ \ -+ return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ -+} \ -+ \ -+static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ -+{ \ -+ return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ -+} \ -+ \ -+static inline int six_lock_##type(struct six_lock *lock, \ -+ six_lock_should_sleep_fn fn, void *p)\ -+{ \ -+ return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ -+} \ -+ \ -+static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ -+{ \ -+ six_unlock_ip(lock, SIX_LOCK_##type, ip); \ -+} \ -+ \ -+static inline void six_unlock_##type(struct six_lock *lock) \ -+{ \ -+ six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ -+} -+ -+__SIX_LOCK(read) -+__SIX_LOCK(intent) -+__SIX_LOCK(write) -+#undef __SIX_LOCK -+ -+void six_lock_downgrade(struct six_lock *); -+bool six_lock_tryupgrade(struct six_lock *); -+bool six_trylock_convert(struct six_lock *, enum six_lock_type, -+ enum six_lock_type); -+ -+void six_lock_increment(struct six_lock *, enum six_lock_type); -+ -+void six_lock_wakeup_all(struct six_lock *); -+ -+struct six_lock_count { -+ unsigned n[3]; -+}; -+ -+struct six_lock_count six_lock_counts(struct six_lock *); -+void six_lock_readers_add(struct six_lock *, int); -+ -+#endif /* _LINUX_SIX_H */ -diff --git a/include/linux/slab.h b/include/linux/slab.h -index 6b3e155b7..f7bc3ab70 100644 ---- a/include/linux/slab.h -+++ b/include/linux/slab.h -@@ -147,6 +147,13 @@ - #endif - #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ - -+#ifdef CONFIG_SLAB_OBJ_EXT -+/* Slab created using create_boot_cache */ -+#define SLAB_NO_OBJ_EXT ((slab_flags_t __force)0x20000000U) -+#else -+#define SLAB_NO_OBJ_EXT 0 -+#endif -+ - /* - * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. - * -@@ -206,7 +213,9 @@ int kmem_cache_shrink(struct kmem_cache *s); - /* - * Common kmalloc functions provided by all allocators - */ --void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2); -+void * __must_check krealloc_noprof(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2); -+#define krealloc(...) 
alloc_hooks(krealloc_noprof(__VA_ARGS__)) -+ - void kfree(const void *objp); - void kfree_sensitive(const void *objp); - size_t __ksize(const void *objp); -@@ -444,7 +453,10 @@ static __always_inline unsigned int __kmalloc_index(size_t size, - static_assert(PAGE_SHIFT <= 20); - #define kmalloc_index(s) __kmalloc_index(s, true) - --void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); -+#include -+ -+void *__kmalloc_noprof(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); -+#define __kmalloc(...) alloc_hooks(__kmalloc_noprof(__VA_ARGS__)) - - /** - * kmem_cache_alloc - Allocate an object -@@ -456,9 +468,13 @@ void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_siz - * - * Return: pointer to the new object or %NULL in case of error - */ --void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc; --void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, -- gfp_t gfpflags) __assume_slab_alignment __malloc; -+void *kmem_cache_alloc_noprof(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc; -+#define kmem_cache_alloc(...) alloc_hooks(kmem_cache_alloc_noprof(__VA_ARGS__)) -+ -+void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, -+ gfp_t gfpflags) __assume_slab_alignment __malloc; -+#define kmem_cache_alloc_lru(...) alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__)) -+ - void kmem_cache_free(struct kmem_cache *s, void *objp); - - /* -@@ -469,29 +485,40 @@ void kmem_cache_free(struct kmem_cache *s, void *objp); - * Note that interrupts must be enabled when calling these functions. - */ - void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); --int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p); -+ -+int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p); -+#define kmem_cache_alloc_bulk(...) alloc_hooks(kmem_cache_alloc_bulk_noprof(__VA_ARGS__)) - - static __always_inline void kfree_bulk(size_t size, void **p) - { - kmem_cache_free_bulk(NULL, size, p); - } - --void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment -+void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment - __alloc_size(1); --void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment -- __malloc; -+#define __kmalloc_node(...) alloc_hooks(__kmalloc_node_noprof(__VA_ARGS__)) - --void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) -+void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment -+ __malloc; -+#define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__)) -+ -+void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t flags, size_t size) - __assume_kmalloc_alignment __alloc_size(3); - --void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, -- int node, size_t size) __assume_kmalloc_alignment -+void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, -+ int node, size_t size) __assume_kmalloc_alignment - __alloc_size(4); --void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment -+#define kmalloc_trace(...) alloc_hooks(kmalloc_trace_noprof(__VA_ARGS__)) -+ -+#define kmalloc_node_trace(...) 
alloc_hooks(kmalloc_node_trace_noprof(__VA_ARGS__)) -+ -+void *kmalloc_large_noprof(size_t size, gfp_t flags) __assume_page_alignment - __alloc_size(1); -+#define kmalloc_large(...) alloc_hooks(kmalloc_large_noprof(__VA_ARGS__)) - --void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_alignment -+void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) __assume_page_alignment - __alloc_size(1); -+#define kmalloc_large_node(...) alloc_hooks(kmalloc_large_node_noprof(__VA_ARGS__)) - - /** - * kmalloc - allocate kernel memory -@@ -547,37 +574,39 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align - * Try really hard to succeed the allocation but fail - * eventually. - */ --static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) -+static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t flags) - { - if (__builtin_constant_p(size) && size) { - unsigned int index; - - if (size > KMALLOC_MAX_CACHE_SIZE) -- return kmalloc_large(size, flags); -+ return kmalloc_large_noprof(size, flags); - - index = kmalloc_index(size); -- return kmalloc_trace( -+ return kmalloc_trace_noprof( - kmalloc_caches[kmalloc_type(flags)][index], - flags, size); - } -- return __kmalloc(size, flags); -+ return __kmalloc_noprof(size, flags); - } -+#define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__)) - --static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) -+static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gfp_t flags, int node) - { - if (__builtin_constant_p(size) && size) { - unsigned int index; - - if (size > KMALLOC_MAX_CACHE_SIZE) -- return kmalloc_large_node(size, flags, node); -+ return kmalloc_large_node_noprof(size, flags, node); - - index = kmalloc_index(size); -- return kmalloc_node_trace( -+ return kmalloc_node_trace_noprof( - kmalloc_caches[kmalloc_type(flags)][index], - flags, node, size); - } -- return __kmalloc_node(size, flags, node); -+ return __kmalloc_node_noprof(size, flags, node); - } -+#define kmalloc_node(...) alloc_hooks(kmalloc_node_noprof(__VA_ARGS__)) - - /** - * kmalloc_array - allocate memory for an array. -@@ -585,16 +614,17 @@ static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t fla - * @size: element size. - * @flags: the type of memory to allocate (see kmalloc). - */ --static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_t flags) -+static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t size, gfp_t flags) - { - size_t bytes; - - if (unlikely(check_mul_overflow(n, size, &bytes))) - return NULL; - if (__builtin_constant_p(n) && __builtin_constant_p(size)) -- return kmalloc(bytes, flags); -- return __kmalloc(bytes, flags); -+ return kmalloc_noprof(bytes, flags); -+ return kmalloc_noprof(bytes, flags); - } -+#define kmalloc_array(...) alloc_hooks(kmalloc_array_noprof(__VA_ARGS__)) - - /** - * krealloc_array - reallocate memory for an array. 
-@@ -603,18 +633,19 @@ static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_ - * @new_size: new size of a single member of the array - * @flags: the type of memory to allocate (see kmalloc) - */ --static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p, -- size_t new_n, -- size_t new_size, -- gfp_t flags) -+static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p, -+ size_t new_n, -+ size_t new_size, -+ gfp_t flags) - { - size_t bytes; - - if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) - return NULL; - -- return krealloc(p, bytes, flags); -+ return krealloc_noprof(p, bytes, flags); - } -+#define krealloc_array(...) alloc_hooks(krealloc_array_noprof(__VA_ARGS__)) - - /** - * kcalloc - allocate memory for an array. The memory is set to zero. -@@ -622,16 +653,11 @@ static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p, - * @size: element size. - * @flags: the type of memory to allocate (see kmalloc). - */ --static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flags) --{ -- return kmalloc_array(n, size, flags | __GFP_ZERO); --} -+#define kcalloc(_n, _size, _flags) kmalloc_array(_n, _size, (_flags) | __GFP_ZERO) - --void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, -+void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, int node, - unsigned long caller) __alloc_size(1); --#define kmalloc_node_track_caller(size, flags, node) \ -- __kmalloc_node_track_caller(size, flags, node, \ -- _RET_IP_) -+#define kmalloc_node_track_caller(...) alloc_hooks(kmalloc_node_track_caller_noprof(__VA_ARGS__, _RET_IP_)) - - /* - * kmalloc_track_caller is a special version of kmalloc that records the -@@ -641,11 +667,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, - * allocator where we care about the real place the memory allocation - * request comes from. - */ --#define kmalloc_track_caller(size, flags) \ -- __kmalloc_node_track_caller(size, flags, \ -- NUMA_NO_NODE, _RET_IP_) -+#define kmalloc_track_caller(...) kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE) - --static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, -+static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, - int node) - { - size_t bytes; -@@ -653,75 +677,51 @@ static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, - if (unlikely(check_mul_overflow(n, size, &bytes))) - return NULL; - if (__builtin_constant_p(n) && __builtin_constant_p(size)) -- return kmalloc_node(bytes, flags, node); -- return __kmalloc_node(bytes, flags, node); -+ return kmalloc_node_noprof(bytes, flags, node); -+ return __kmalloc_node_noprof(bytes, flags, node); - } -+#define kmalloc_array_node(...) alloc_hooks(kmalloc_array_node_noprof(__VA_ARGS__)) - --static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) --{ -- return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); --} -+#define kcalloc_node(_n, _size, _flags, _node) kmalloc_array_node(_n, _size, (_flags) | __GFP_ZERO, _node) - - /* - * Shortcuts - */ --static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags) --{ -- return kmem_cache_alloc(k, flags | __GFP_ZERO); --} -+#define kmem_cache_zalloc(_k, _flags) kmem_cache_alloc(_k, (_flags)|__GFP_ZERO) - - /** - * kzalloc - allocate memory. The memory is set to zero. 
- * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate (see kmalloc). - */ --static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags) --{ -- return kmalloc(size, flags | __GFP_ZERO); --} -- --/** -- * kzalloc_node - allocate zeroed memory from a particular memory node. -- * @size: how many bytes of memory are required. -- * @flags: the type of memory to allocate (see kmalloc). -- * @node: memory node from which to allocate -- */ --static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int node) -+static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags) - { -- return kmalloc_node(size, flags | __GFP_ZERO, node); -+ return kmalloc_noprof(size, flags | __GFP_ZERO); - } -+#define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__)) -+#define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node) - --extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __alloc_size(1); --static inline __alloc_size(1) void *kvmalloc(size_t size, gfp_t flags) --{ -- return kvmalloc_node(size, flags, NUMA_NO_NODE); --} --static inline __alloc_size(1) void *kvzalloc_node(size_t size, gfp_t flags, int node) --{ -- return kvmalloc_node(size, flags | __GFP_ZERO, node); --} --static inline __alloc_size(1) void *kvzalloc(size_t size, gfp_t flags) --{ -- return kvmalloc(size, flags | __GFP_ZERO); --} -+extern void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) __alloc_size(1); -+#define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__)) - --static inline __alloc_size(1, 2) void *kvmalloc_array(size_t n, size_t size, gfp_t flags) --{ -- size_t bytes; -+#define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE) -+#define kvzalloc(_size, _flags) kvmalloc(_size, _flags|__GFP_ZERO) - -- if (unlikely(check_mul_overflow(n, size, &bytes))) -- return NULL; -+#define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, _flags|__GFP_ZERO, _node) - -- return kvmalloc(bytes, flags); --} -+#define kvmalloc_array(_n, _size, _flags) \ -+({ \ -+ size_t _bytes; \ -+ \ -+ !check_mul_overflow(_n, _size, &_bytes) ? kvmalloc(_bytes, _flags) : NULL; \ -+}) - --static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t flags) --{ -- return kvmalloc_array(n, size, flags | __GFP_ZERO); --} -+#define kvcalloc(_n, _size, _flags) kvmalloc_array(_n, _size, _flags|__GFP_ZERO) - --extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) -+extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - __realloc_size(3); -+#define kvrealloc(...) 
alloc_hooks(kvrealloc_noprof(__VA_ARGS__)) -+ - extern void kvfree(const void *addr); - extern void kvfree_sensitive(const void *addr, size_t len); - -diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h -index a61e7d55d..23f14dcb8 100644 ---- a/include/linux/slab_def.h -+++ b/include/linux/slab_def.h -@@ -107,7 +107,7 @@ static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *sla - * reciprocal_divide(offset, cache->reciprocal_buffer_size) - */ - static inline unsigned int obj_to_index(const struct kmem_cache *cache, -- const struct slab *slab, void *obj) -+ const struct slab *slab, const void *obj) - { - u32 offset = (obj - slab->s_mem); - return reciprocal_divide(offset, cache->reciprocal_buffer_size); -diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h -index f6df03f93..e8be5b368 100644 ---- a/include/linux/slub_def.h -+++ b/include/linux/slub_def.h -@@ -176,14 +176,14 @@ static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *sla - - /* Determine object index from a given position */ - static inline unsigned int __obj_to_index(const struct kmem_cache *cache, -- void *addr, void *obj) -+ void *addr, const void *obj) - { - return reciprocal_divide(kasan_reset_tag(obj) - addr, - cache->reciprocal_size); - } - - static inline unsigned int obj_to_index(const struct kmem_cache *cache, -- const struct slab *slab, void *obj) -+ const struct slab *slab, const void *obj) - { - if (is_kfence_address(obj)) - return 0; -diff --git a/include/linux/string.h b/include/linux/string.h -index c062c581a..198ca51ed 100644 ---- a/include/linux/string.h -+++ b/include/linux/string.h -@@ -96,6 +96,7 @@ extern char * strpbrk(const char *,const char *); - #ifndef __HAVE_ARCH_STRSEP - extern char * strsep(char **,const char *); - #endif -+extern char *strsep_no_empty(char **, const char *); - #ifndef __HAVE_ARCH_STRSPN - extern __kernel_size_t strspn(const char *,const char *); - #endif -@@ -176,7 +177,9 @@ extern void kfree_const(const void *x); - extern char *kstrdup(const char *s, gfp_t gfp) __malloc; - extern const char *kstrdup_const(const char *s, gfp_t gfp); - extern char *kstrndup(const char *s, size_t len, gfp_t gfp); --extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); -+extern void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) __realloc_size(2); -+#define kmemdup(...) alloc_hooks(kmemdup_noprof(__VA_ARGS__)) -+ - extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); - extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); - diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h -index fae6beaaa..ae51580b9 100644 +index 789ab3004..1cc137402 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h -@@ -16,15 +16,14 @@ static inline bool string_is_terminated(const char *s, int len) +@@ -17,15 +17,14 @@ static inline bool string_is_terminated(const char *s, int len) return memchr(s, '\0', len) ? 
true : false; } @@ -97194,121 +97343,6 @@ index fae6beaaa..ae51580b9 100644 int parse_int_array_user(const char __user *from, size_t count, int **array); -diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h -index bb9d3f554..d8e0cacfc 100644 ---- a/include/linux/time_namespace.h -+++ b/include/linux/time_namespace.h -@@ -11,6 +11,8 @@ - struct user_namespace; - extern struct user_namespace init_user_ns; - -+struct vm_area_struct; -+ - struct timens_offsets { - struct timespec64 monotonic; - struct timespec64 boottime; -diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h -index c720be70c..106d78e75 100644 ---- a/include/linux/vmalloc.h -+++ b/include/linux/vmalloc.h -@@ -2,6 +2,8 @@ - #ifndef _LINUX_VMALLOC_H - #define _LINUX_VMALLOC_H - -+#include -+#include - #include - #include - #include -@@ -137,26 +139,54 @@ extern unsigned long vmalloc_nr_pages(void); - static inline unsigned long vmalloc_nr_pages(void) { return 0; } - #endif - --extern void *vmalloc(unsigned long size) __alloc_size(1); --extern void *vzalloc(unsigned long size) __alloc_size(1); --extern void *vmalloc_user(unsigned long size) __alloc_size(1); --extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); --extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); --extern void *vmalloc_32(unsigned long size) __alloc_size(1); --extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); --extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); --extern void *__vmalloc_node_range(unsigned long size, unsigned long align, -+extern void *vmalloc_noprof(unsigned long size) __alloc_size(1); -+#define vmalloc(...) alloc_hooks(vmalloc_noprof(__VA_ARGS__)) -+ -+extern void *vzalloc_noprof(unsigned long size) __alloc_size(1); -+#define vzalloc(...) alloc_hooks(vzalloc_noprof(__VA_ARGS__)) -+ -+extern void *vmalloc_user_noprof(unsigned long size) __alloc_size(1); -+#define vmalloc_user(...) alloc_hooks(vmalloc_user_noprof(__VA_ARGS__)) -+ -+extern void *vmalloc_node_noprof(unsigned long size, int node) __alloc_size(1); -+#define vmalloc_node(...) alloc_hooks(vmalloc_node_noprof(__VA_ARGS__)) -+ -+extern void *vzalloc_node_noprof(unsigned long size, int node) __alloc_size(1); -+#define vzalloc_node(...) alloc_hooks(vzalloc_node_noprof(__VA_ARGS__)) -+ -+extern void *vmalloc_32_noprof(unsigned long size) __alloc_size(1); -+#define vmalloc_32(...) alloc_hooks(vmalloc_32_noprof(__VA_ARGS__)) -+ -+extern void *vmalloc_32_user_noprof(unsigned long size) __alloc_size(1); -+#define vmalloc_32_user(...) alloc_hooks(vmalloc_32_user_noprof(__VA_ARGS__)) -+ -+extern void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); -+#define __vmalloc(...) alloc_hooks(__vmalloc_noprof(__VA_ARGS__)) -+ -+extern void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, - unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, unsigned long vm_flags, int node, - const void *caller) __alloc_size(1); --void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, -+#define __vmalloc_node_range(...) alloc_hooks(__vmalloc_node_range_noprof(__VA_ARGS__)) -+ -+void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, - int node, const void *caller) __alloc_size(1); --void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1); -+#define __vmalloc_node(...) 
alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__)) -+ -+void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); -+#define vmalloc_huge(...) alloc_hooks(vmalloc_huge_noprof(__VA_ARGS__)) -+ -+extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); -+#define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__)) -+ -+extern void *vmalloc_array_noprof(size_t n, size_t size) __alloc_size(1, 2); -+#define vmalloc_array(...) alloc_hooks(vmalloc_array_noprof(__VA_ARGS__)) -+ -+extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); -+#define __vcalloc(...) alloc_hooks(__vcalloc_noprof(__VA_ARGS__)) - --extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); --extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2); --extern void *__vcalloc(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); --extern void *vcalloc(size_t n, size_t size) __alloc_size(1, 2); -+extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); -+#define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) - - extern void vfree(const void *addr); - extern void vfree_atomic(const void *addr); -diff --git a/init/Kconfig b/init/Kconfig -index b6d38eccc..cec6bac1a 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -940,10 +940,14 @@ config CGROUP_FAVOR_DYNMODS - - Say N if unsure. - -+config SLAB_OBJ_EXT -+ bool -+ - config MEMCG - bool "Memory controller" - select PAGE_COUNTER - select EVENTFD -+ select SLAB_OBJ_EXT - help - Provides control over the memory footprint of tasks in a cgroup. - diff --git a/init/init_task.c b/init/init_task.c index ff6c4b9bf..f703116e0 100644 --- a/init/init_task.c @@ -97321,57 +97355,21 @@ index ff6c4b9bf..f703116e0 100644 .restart_block = { .fn = do_no_restart_syscall, }, -diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks -index 4198f0273..b2abd9a5d 100644 ---- a/kernel/Kconfig.locks -+++ b/kernel/Kconfig.locks -@@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB - config MMIOWB - def_bool y if ARCH_HAS_MMIOWB - depends on SMP -+ -+config SIXLOCKS -+ bool -diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c -index 9a4db5cce..fc42930af 100644 ---- a/kernel/dma/mapping.c -+++ b/kernel/dma/mapping.c -@@ -570,9 +570,9 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size, - size = PAGE_ALIGN(size); - if (dma_alloc_direct(dev, ops)) - return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); -- if (!ops->alloc_pages) -+ if (!ops->alloc_pages_op) - return NULL; -- return ops->alloc_pages(dev, size, dma_handle, dir, gfp); -+ return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp); - } - - struct page *dma_alloc_pages(struct device *dev, size_t size, -diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile -index 0db4093d1..a095dbbf0 100644 ---- a/kernel/locking/Makefile -+++ b/kernel/locking/Makefile -@@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o - obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o - obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o - obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o -+obj-$(CONFIG_SIXLOCKS) += six.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c -index 4dfd2f3e0..0463302e2 100644 +index 111607d91..b6c3a8788 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c -@@ -3039,6 +3039,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) - if ((next->read == 2) && prev->read) - continue; +@@ -3056,6 +3056,9 @@ 
check_deadlock(struct task_struct *curr, struct held_lock *next) -+ if (hlock_class(next)->no_check_recursion) + class = hlock_class(prev); + ++ if (class->no_check_recursion) + continue; + - /* - * We're holding the nest_lock, which serializes this lock's - * nesting behaviour. -@@ -3100,6 +3103,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, + if (class->cmp_fn && + class->cmp_fn(prev->instance, next->instance) < 0) + continue; +@@ -3121,6 +3124,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return 2; } @@ -97379,10 +97377,10 @@ index 4dfd2f3e0..0463302e2 100644 + hlock_class(prev)->no_check_recursion) + return 2; + - /* - * Prove that the new -> dependency would not - * create a circular dependency in the graph. (We do this by -@@ -6551,6 +6558,26 @@ void debug_check_no_locks_held(void) + if (prev->class_idx == next->class_idx) { + struct lock_class *class = hlock_class(prev); + +@@ -6607,6 +6614,26 @@ void debug_check_no_locks_held(void) } EXPORT_SYMBOL_GPL(debug_check_no_locks_held); @@ -97409,7 +97407,7 @@ index 4dfd2f3e0..0463302e2 100644 #ifdef __KERNEL__ void debug_show_all_locks(void) { -@@ -6664,3 +6691,22 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) +@@ -6720,3 +6747,22 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); @@ -97432,6 +97430,20 @@ index 4dfd2f3e0..0463302e2 100644 +} +EXPORT_SYMBOL_GPL(lockdep_set_no_check_recursion); +#endif +diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c +index d973fe604..2deeeca3e 100644 +--- a/kernel/locking/mutex.c ++++ b/kernel/locking/mutex.c +@@ -1126,6 +1126,9 @@ EXPORT_SYMBOL(ww_mutex_lock_interruptible); + #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ + #endif /* !CONFIG_PREEMPT_RT */ + ++EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin); ++EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end); ++ + /** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index d5610ad52..b752ec5cc 100644 --- a/kernel/locking/osq_lock.c @@ -97449,1001 +97461,6 @@ index d5610ad52..b752ec5cc 100644 WRITE_ONCE(next->locked, 1); } +EXPORT_SYMBOL_GPL(osq_unlock); -diff --git a/kernel/locking/six.c b/kernel/locking/six.c -new file mode 100644 -index 000000000..0b9c4bb7c ---- /dev/null -+++ b/kernel/locking/six.c -@@ -0,0 +1,893 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#ifdef DEBUG -+#define EBUG_ON(cond) BUG_ON(cond) -+#else -+#define EBUG_ON(cond) do {} while (0) -+#endif -+ -+#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) -+#define six_release(l, ip) lock_release(l, ip) -+ -+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); -+ -+#define SIX_LOCK_HELD_read_OFFSET 0 -+#define SIX_LOCK_HELD_read ~(~0U << 26) -+#define SIX_LOCK_HELD_intent (1U << 26) -+#define SIX_LOCK_HELD_write (1U << 27) -+#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) -+#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent)) -+#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) -+#define SIX_LOCK_NOSPIN (1U << 31) -+ -+struct six_lock_vals { -+ /* Value we add to the lock in order to take the lock: */ -+ u32 lock_val; -+ -+ /* If the lock has this value (used as a mask), taking the lock fails: 
*/ -+ u32 lock_fail; -+ -+ /* Mask that indicates lock is held for this type: */ -+ u32 held_mask; -+ -+ /* Waitlist we wakeup when releasing the lock: */ -+ enum six_lock_type unlock_wakeup; -+}; -+ -+static const struct six_lock_vals l[] = { -+ [SIX_LOCK_read] = { -+ .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, -+ .lock_fail = SIX_LOCK_HELD_write, -+ .held_mask = SIX_LOCK_HELD_read, -+ .unlock_wakeup = SIX_LOCK_write, -+ }, -+ [SIX_LOCK_intent] = { -+ .lock_val = SIX_LOCK_HELD_intent, -+ .lock_fail = SIX_LOCK_HELD_intent, -+ .held_mask = SIX_LOCK_HELD_intent, -+ .unlock_wakeup = SIX_LOCK_intent, -+ }, -+ [SIX_LOCK_write] = { -+ .lock_val = SIX_LOCK_HELD_write, -+ .lock_fail = SIX_LOCK_HELD_read, -+ .held_mask = SIX_LOCK_HELD_write, -+ .unlock_wakeup = SIX_LOCK_read, -+ }, -+}; -+ -+static inline void six_set_bitmask(struct six_lock *lock, u32 mask) -+{ -+ if ((atomic_read(&lock->state) & mask) != mask) -+ atomic_or(mask, &lock->state); -+} -+ -+static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) -+{ -+ if (atomic_read(&lock->state) & mask) -+ atomic_and(~mask, &lock->state); -+} -+ -+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, -+ u32 old, struct task_struct *owner) -+{ -+ if (type != SIX_LOCK_intent) -+ return; -+ -+ if (!(old & SIX_LOCK_HELD_intent)) { -+ EBUG_ON(lock->owner); -+ lock->owner = owner; -+ } else { -+ EBUG_ON(lock->owner != current); -+ } -+} -+ -+static inline unsigned pcpu_read_count(struct six_lock *lock) -+{ -+ unsigned read_count = 0; -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ read_count += *per_cpu_ptr(lock->readers, cpu); -+ return read_count; -+} -+ -+/* -+ * __do_six_trylock() - main trylock routine -+ * -+ * Returns 1 on success, 0 on failure -+ * -+ * In percpu reader mode, a failed trylock may cause a spurious trylock failure -+ * for anoter thread taking the competing lock type, and we may havve to do a -+ * wakeup: when a wakeup is required, we return -1 - wakeup_type. -+ */ -+static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, -+ struct task_struct *task, bool try) -+{ -+ int ret; -+ u32 old; -+ -+ EBUG_ON(type == SIX_LOCK_write && lock->owner != task); -+ EBUG_ON(type == SIX_LOCK_write && -+ (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); -+ -+ /* -+ * Percpu reader mode: -+ * -+ * The basic idea behind this algorithm is that you can implement a lock -+ * between two threads without any atomics, just memory barriers: -+ * -+ * For two threads you'll need two variables, one variable for "thread a -+ * has the lock" and another for "thread b has the lock". -+ * -+ * To take the lock, a thread sets its variable indicating that it holds -+ * the lock, then issues a full memory barrier, then reads from the -+ * other thread's variable to check if the other thread thinks it has -+ * the lock. If we raced, we backoff and retry/sleep. -+ * -+ * Failure to take the lock may cause a spurious trylock failure in -+ * another thread, because we temporarily set the lock to indicate that -+ * we held it. This would be a problem for a thread in six_lock(), when -+ * they are calling trylock after adding themself to the waitlist and -+ * prior to sleeping. -+ * -+ * Therefore, if we fail to get the lock, and there were waiters of the -+ * type we conflict with, we will have to issue a wakeup. -+ * -+ * Since we may be called under wait_lock (and by the wakeup code -+ * itself), we return that the wakeup has to be done instead of doing it -+ * here. 
-+ */ -+ if (type == SIX_LOCK_read && lock->readers) { -+ preempt_disable(); -+ this_cpu_inc(*lock->readers); /* signal that we own lock */ -+ -+ smp_mb(); -+ -+ old = atomic_read(&lock->state); -+ ret = !(old & l[type].lock_fail); -+ -+ this_cpu_sub(*lock->readers, !ret); -+ preempt_enable(); -+ -+ if (!ret && (old & SIX_LOCK_WAITING_write)) -+ ret = -1 - SIX_LOCK_write; -+ } else if (type == SIX_LOCK_write && lock->readers) { -+ if (try) { -+ atomic_add(SIX_LOCK_HELD_write, &lock->state); -+ smp_mb__after_atomic(); -+ } -+ -+ ret = !pcpu_read_count(lock); -+ -+ if (try && !ret) { -+ old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); -+ if (old & SIX_LOCK_WAITING_read) -+ ret = -1 - SIX_LOCK_read; -+ } -+ } else { -+ old = atomic_read(&lock->state); -+ do { -+ ret = !(old & l[type].lock_fail); -+ if (!ret || (type == SIX_LOCK_write && !try)) { -+ smp_mb(); -+ break; -+ } -+ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); -+ -+ EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); -+ } -+ -+ if (ret > 0) -+ six_set_owner(lock, type, old, task); -+ -+ EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && -+ (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); -+ -+ return ret; -+} -+ -+static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) -+{ -+ struct six_lock_waiter *w, *next; -+ struct task_struct *task; -+ bool saw_one; -+ int ret; -+again: -+ ret = 0; -+ saw_one = false; -+ raw_spin_lock(&lock->wait_lock); -+ -+ list_for_each_entry_safe(w, next, &lock->wait_list, list) { -+ if (w->lock_want != lock_type) -+ continue; -+ -+ if (saw_one && lock_type != SIX_LOCK_read) -+ goto unlock; -+ saw_one = true; -+ -+ ret = __do_six_trylock(lock, lock_type, w->task, false); -+ if (ret <= 0) -+ goto unlock; -+ -+ __list_del(w->list.prev, w->list.next); -+ task = w->task; -+ /* -+ * Do no writes to @w besides setting lock_acquired - otherwise -+ * we would need a memory barrier: -+ */ -+ barrier(); -+ w->lock_acquired = true; -+ wake_up_process(task); -+ } -+ -+ six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); -+unlock: -+ raw_spin_unlock(&lock->wait_lock); -+ -+ if (ret < 0) { -+ lock_type = -ret - 1; -+ goto again; -+ } -+} -+ -+__always_inline -+static void six_lock_wakeup(struct six_lock *lock, u32 state, -+ enum six_lock_type lock_type) -+{ -+ if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) -+ return; -+ -+ if (!(state & (SIX_LOCK_WAITING_read << lock_type))) -+ return; -+ -+ __six_lock_wakeup(lock, lock_type); -+} -+ -+__always_inline -+static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) -+{ -+ int ret; -+ -+ ret = __do_six_trylock(lock, type, current, try); -+ if (ret < 0) -+ __six_lock_wakeup(lock, -ret - 1); -+ -+ return ret > 0; -+} -+ -+/** -+ * six_trylock_ip - attempt to take a six lock without blocking -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * Return: true on success, false on failure. 
-+ */ -+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -+{ -+ if (!do_six_trylock(lock, type, true)) -+ return false; -+ -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); -+ return true; -+} -+EXPORT_SYMBOL_GPL(six_trylock_ip); -+ -+/** -+ * six_relock_ip - attempt to re-take a lock that was held previously -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @seq: lock sequence number obtained from six_lock_seq() while lock was -+ * held previously -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * Return: true on success, false on failure. -+ */ -+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq, unsigned long ip) -+{ -+ if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) -+ return false; -+ -+ if (six_lock_seq(lock) != seq) { -+ six_unlock_ip(lock, type, ip); -+ return false; -+ } -+ -+ return true; -+} -+EXPORT_SYMBOL_GPL(six_relock_ip); -+ -+#ifdef CONFIG_LOCK_SPIN_ON_OWNER -+ -+static inline bool six_can_spin_on_owner(struct six_lock *lock) -+{ -+ struct task_struct *owner; -+ bool ret; -+ -+ if (need_resched()) -+ return false; -+ -+ rcu_read_lock(); -+ owner = READ_ONCE(lock->owner); -+ ret = !owner || owner_on_cpu(owner); -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline bool six_spin_on_owner(struct six_lock *lock, -+ struct task_struct *owner, -+ u64 end_time) -+{ -+ bool ret = true; -+ unsigned loop = 0; -+ -+ rcu_read_lock(); -+ while (lock->owner == owner) { -+ /* -+ * Ensure we emit the owner->on_cpu, dereference _after_ -+ * checking lock->owner still matches owner. If that fails, -+ * owner might point to freed memory. If it still matches, -+ * the rcu_read_lock() ensures the memory stays valid. -+ */ -+ barrier(); -+ -+ if (!owner_on_cpu(owner) || need_resched()) { -+ ret = false; -+ break; -+ } -+ -+ if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { -+ six_set_bitmask(lock, SIX_LOCK_NOSPIN); -+ ret = false; -+ break; -+ } -+ -+ cpu_relax(); -+ } -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -+{ -+ struct task_struct *task = current; -+ u64 end_time; -+ -+ if (type == SIX_LOCK_write) -+ return false; -+ -+ preempt_disable(); -+ if (!six_can_spin_on_owner(lock)) -+ goto fail; -+ -+ if (!osq_lock(&lock->osq)) -+ goto fail; -+ -+ end_time = sched_clock() + 10 * NSEC_PER_USEC; -+ -+ while (1) { -+ struct task_struct *owner; -+ -+ /* -+ * If there's an owner, wait for it to either -+ * release the lock or go to sleep. -+ */ -+ owner = READ_ONCE(lock->owner); -+ if (owner && !six_spin_on_owner(lock, owner, end_time)) -+ break; -+ -+ if (do_six_trylock(lock, type, false)) { -+ osq_unlock(&lock->osq); -+ preempt_enable(); -+ return true; -+ } -+ -+ /* -+ * When there's no owner, we might have preempted between the -+ * owner acquiring the lock and setting the owner field. If -+ * we're an RT task that will live-lock because we won't let -+ * the owner complete. -+ */ -+ if (!owner && (need_resched() || rt_task(task))) -+ break; -+ -+ /* -+ * The cpu_relax() call is a compiler barrier which forces -+ * everything in this loop to be re-loaded. We don't need -+ * memory barriers as we'll eventually observe the right -+ * values at the cost of a few extra spins. 
-+ */ -+ cpu_relax(); -+ } -+ -+ osq_unlock(&lock->osq); -+fail: -+ preempt_enable(); -+ -+ /* -+ * If we fell out of the spin path because of need_resched(), -+ * reschedule now, before we try-lock again. This avoids getting -+ * scheduled out right after we obtained the lock. -+ */ -+ if (need_resched()) -+ schedule(); -+ -+ return false; -+} -+ -+#else /* CONFIG_LOCK_SPIN_ON_OWNER */ -+ -+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -+{ -+ return false; -+} -+ -+#endif -+ -+noinline -+static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, -+ struct six_lock_waiter *wait, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip) -+{ -+ int ret = 0; -+ -+ if (type == SIX_LOCK_write) { -+ EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); -+ atomic_add(SIX_LOCK_HELD_write, &lock->state); -+ smp_mb__after_atomic(); -+ } -+ -+ trace_contention_begin(lock, 0); -+ lock_contended(&lock->dep_map, ip); -+ -+ if (six_optimistic_spin(lock, type)) -+ goto out; -+ -+ wait->task = current; -+ wait->lock_want = type; -+ wait->lock_acquired = false; -+ -+ raw_spin_lock(&lock->wait_lock); -+ six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); -+ /* -+ * Retry taking the lock after taking waitlist lock, in case we raced -+ * with an unlock: -+ */ -+ ret = __do_six_trylock(lock, type, current, false); -+ if (ret <= 0) { -+ wait->start_time = local_clock(); -+ -+ if (!list_empty(&lock->wait_list)) { -+ struct six_lock_waiter *last = -+ list_last_entry(&lock->wait_list, -+ struct six_lock_waiter, list); -+ -+ if (time_before_eq64(wait->start_time, last->start_time)) -+ wait->start_time = last->start_time + 1; -+ } -+ -+ list_add_tail(&wait->list, &lock->wait_list); -+ } -+ raw_spin_unlock(&lock->wait_lock); -+ -+ if (unlikely(ret > 0)) { -+ ret = 0; -+ goto out; -+ } -+ -+ if (unlikely(ret < 0)) { -+ __six_lock_wakeup(lock, -ret - 1); -+ ret = 0; -+ } -+ -+ while (1) { -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ -+ if (wait->lock_acquired) -+ break; -+ -+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; -+ if (unlikely(ret)) { -+ raw_spin_lock(&lock->wait_lock); -+ if (!wait->lock_acquired) -+ list_del(&wait->list); -+ raw_spin_unlock(&lock->wait_lock); -+ -+ if (unlikely(wait->lock_acquired)) -+ do_six_unlock_type(lock, type); -+ break; -+ } -+ -+ schedule(); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+out: -+ if (ret && type == SIX_LOCK_write) { -+ six_clear_bitmask(lock, SIX_LOCK_HELD_write); -+ six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); -+ } -+ trace_contention_end(lock, 0); -+ -+ return ret; -+} -+ -+/** -+ * six_lock_ip_waiter - take a lock, with full waitlist interface -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @wait: pointer to wait object, which will be added to lock's waitlist -+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior -+ * to scheduling -+ * @p: passed through to @should_sleep_fn -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * This is the most general six_lock() variant, with parameters to support full -+ * cycle detection for deadlock avoidance. -+ * -+ * The code calling this function must implement tracking of held locks, and the -+ * @wait object should be embedded into the struct that tracks held locks - -+ * which must also be accessible in a thread-safe way. 
-+ * -+ * @should_sleep_fn should invoke the cycle detector; it should walk each -+ * lock's waiters, and for each waiter recursively walk their held locks. -+ * -+ * When this function must block, @wait will be added to @lock's waitlist before -+ * calling trylock, and before calling @should_sleep_fn, and @wait will not be -+ * removed from the lock waitlist until the lock has been successfully acquired, -+ * or we abort. -+ * -+ * @wait.start_time will be monotonically increasing for any given waitlist, and -+ * thus may be used as a loop cursor. -+ * -+ * Return: 0 on success, or the return code from @should_sleep_fn on failure. -+ */ -+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, -+ struct six_lock_waiter *wait, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip) -+{ -+ int ret; -+ -+ wait->start_time = 0; -+ -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); -+ -+ ret = do_six_trylock(lock, type, true) ? 0 -+ : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); -+ -+ if (ret && type != SIX_LOCK_write) -+ six_release(&lock->dep_map, ip); -+ if (!ret) -+ lock_acquired(&lock->dep_map, ip); -+ -+ return ret; -+} -+EXPORT_SYMBOL_GPL(six_lock_ip_waiter); -+ -+__always_inline -+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ u32 state; -+ -+ if (type == SIX_LOCK_intent) -+ lock->owner = NULL; -+ -+ if (type == SIX_LOCK_read && -+ lock->readers) { -+ smp_mb(); /* unlock barrier */ -+ this_cpu_dec(*lock->readers); -+ smp_mb(); /* between unlocking and checking for waiters */ -+ state = atomic_read(&lock->state); -+ } else { -+ u32 v = l[type].lock_val; -+ -+ if (type != SIX_LOCK_read) -+ v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; -+ -+ EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); -+ state = atomic_sub_return_release(v, &lock->state); -+ } -+ -+ six_lock_wakeup(lock, state, l[type].unlock_wakeup); -+} -+ -+/** -+ * six_unlock_ip - drop a six lock -+ * @lock: lock to unlock -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * When a lock is held multiple times (because six_lock_incement()) was used), -+ * this decrements the 'lock held' counter by one. 
-+ * -+ * For example: -+ * six_lock_read(&foo->lock); read count 1 -+ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 -+ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 -+ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 -+ */ -+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -+{ -+ EBUG_ON(type == SIX_LOCK_write && -+ !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); -+ EBUG_ON((type == SIX_LOCK_write || -+ type == SIX_LOCK_intent) && -+ lock->owner != current); -+ -+ if (type != SIX_LOCK_write) -+ six_release(&lock->dep_map, ip); -+ else -+ lock->seq++; -+ -+ if (type == SIX_LOCK_intent && -+ lock->intent_lock_recurse) { -+ --lock->intent_lock_recurse; -+ return; -+ } -+ -+ do_six_unlock_type(lock, type); -+} -+EXPORT_SYMBOL_GPL(six_unlock_ip); -+ -+/** -+ * six_lock_downgrade - convert an intent lock to a read lock -+ * @lock: lock to dowgrade -+ * -+ * @lock will have read count incremented and intent count decremented -+ */ -+void six_lock_downgrade(struct six_lock *lock) -+{ -+ six_lock_increment(lock, SIX_LOCK_read); -+ six_unlock_intent(lock); -+} -+EXPORT_SYMBOL_GPL(six_lock_downgrade); -+ -+/** -+ * six_lock_tryupgrade - attempt to convert read lock to an intent lock -+ * @lock: lock to upgrade -+ * -+ * On success, @lock will have intent count incremented and read count -+ * decremented -+ * -+ * Return: true on success, false on failure -+ */ -+bool six_lock_tryupgrade(struct six_lock *lock) -+{ -+ u32 old = atomic_read(&lock->state), new; -+ -+ do { -+ new = old; -+ -+ if (new & SIX_LOCK_HELD_intent) -+ return false; -+ -+ if (!lock->readers) { -+ EBUG_ON(!(new & SIX_LOCK_HELD_read)); -+ new -= l[SIX_LOCK_read].lock_val; -+ } -+ -+ new |= SIX_LOCK_HELD_intent; -+ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); -+ -+ if (lock->readers) -+ this_cpu_dec(*lock->readers); -+ -+ six_set_owner(lock, SIX_LOCK_intent, old, current); -+ -+ return true; -+} -+EXPORT_SYMBOL_GPL(six_lock_tryupgrade); -+ -+/** -+ * six_trylock_convert - attempt to convert a held lock from one type to another -+ * @lock: lock to upgrade -+ * @from: SIX_LOCK_read or SIX_LOCK_intent -+ * @to: SIX_LOCK_read or SIX_LOCK_intent -+ * -+ * On success, @lock will have intent count incremented and read count -+ * decremented -+ * -+ * Return: true on success, false on failure -+ */ -+bool six_trylock_convert(struct six_lock *lock, -+ enum six_lock_type from, -+ enum six_lock_type to) -+{ -+ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); -+ -+ if (to == from) -+ return true; -+ -+ if (to == SIX_LOCK_read) { -+ six_lock_downgrade(lock); -+ return true; -+ } else { -+ return six_lock_tryupgrade(lock); -+ } -+} -+EXPORT_SYMBOL_GPL(six_trylock_convert); -+ -+/** -+ * six_lock_increment - increase held lock count on a lock that is already held -+ * @lock: lock to increment -+ * @type: SIX_LOCK_read or SIX_LOCK_intent -+ * -+ * @lock must already be held, with a lock type that is greater than or equal to -+ * @type -+ * -+ * A corresponding six_unlock_type() call will be required for @lock to be fully -+ * unlocked. 
-+ */ -+void six_lock_increment(struct six_lock *lock, enum six_lock_type type) -+{ -+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); -+ -+ /* XXX: assert already locked, and that we don't overflow: */ -+ -+ switch (type) { -+ case SIX_LOCK_read: -+ if (lock->readers) { -+ this_cpu_inc(*lock->readers); -+ } else { -+ EBUG_ON(!(atomic_read(&lock->state) & -+ (SIX_LOCK_HELD_read| -+ SIX_LOCK_HELD_intent))); -+ atomic_add(l[type].lock_val, &lock->state); -+ } -+ break; -+ case SIX_LOCK_intent: -+ EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); -+ lock->intent_lock_recurse++; -+ break; -+ case SIX_LOCK_write: -+ BUG(); -+ break; -+ } -+} -+EXPORT_SYMBOL_GPL(six_lock_increment); -+ -+/** -+ * six_lock_wakeup_all - wake up all waiters on @lock -+ * @lock: lock to wake up waiters for -+ * -+ * Wakeing up waiters will cause them to re-run should_sleep_fn, which may then -+ * abort the lock operation. -+ * -+ * This function is never needed in a bug-free program; it's only useful in -+ * debug code, e.g. to determine if a cycle detector is at fault. -+ */ -+void six_lock_wakeup_all(struct six_lock *lock) -+{ -+ u32 state = atomic_read(&lock->state); -+ struct six_lock_waiter *w; -+ -+ six_lock_wakeup(lock, state, SIX_LOCK_read); -+ six_lock_wakeup(lock, state, SIX_LOCK_intent); -+ six_lock_wakeup(lock, state, SIX_LOCK_write); -+ -+ raw_spin_lock(&lock->wait_lock); -+ list_for_each_entry(w, &lock->wait_list, list) -+ wake_up_process(w->task); -+ raw_spin_unlock(&lock->wait_lock); -+} -+EXPORT_SYMBOL_GPL(six_lock_wakeup_all); -+ -+/** -+ * six_lock_counts - return held lock counts, for each lock type -+ * @lock: lock to return counters for -+ * -+ * Return: the number of times a lock is held for read, intent and write. -+ */ -+struct six_lock_count six_lock_counts(struct six_lock *lock) -+{ -+ struct six_lock_count ret; -+ -+ ret.n[SIX_LOCK_read] = !lock->readers -+ ? atomic_read(&lock->state) & SIX_LOCK_HELD_read -+ : pcpu_read_count(lock); -+ ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) + -+ lock->intent_lock_recurse; -+ ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write); -+ -+ return ret; -+} -+EXPORT_SYMBOL_GPL(six_lock_counts); -+ -+/** -+ * six_lock_readers_add - directly manipulate reader count of a lock -+ * @lock: lock to add/subtract readers for -+ * @nr: reader count to add/subtract -+ * -+ * When an upper layer is implementing lock reentrency, we may have both read -+ * and intent locks on the same lock. -+ * -+ * When we need to take a write lock, the read locks will cause self-deadlock, -+ * because six locks themselves do not track which read locks are held by the -+ * current thread and which are held by a different thread - it does no -+ * per-thread tracking of held locks. -+ * -+ * The upper layer that is tracking held locks may however, if trylock() has -+ * failed, count up its own read locks, subtract them, take the write lock, and -+ * then re-add them. -+ * -+ * As in any other situation when taking a write lock, @lock must be held for -+ * intent one (or more) times, so @lock will never be left unlocked. 
-+ */ -+void six_lock_readers_add(struct six_lock *lock, int nr) -+{ -+ if (lock->readers) { -+ this_cpu_add(*lock->readers, nr); -+ } else { -+ EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0); -+ /* reader count starts at bit 0 */ -+ atomic_add(nr, &lock->state); -+ } -+} -+EXPORT_SYMBOL_GPL(six_lock_readers_add); -+ -+/** -+ * six_lock_exit - release resources held by a lock prior to freeing -+ * @lock: lock to exit -+ * -+ * When a lock was initialized in percpu mode (SIX_OLCK_INIT_PCPU), this is -+ * required to free the percpu read counts. -+ */ -+void six_lock_exit(struct six_lock *lock) -+{ -+ WARN_ON(lock->readers && pcpu_read_count(lock)); -+ WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read); -+ -+ free_percpu(lock->readers); -+ lock->readers = NULL; -+} -+EXPORT_SYMBOL_GPL(six_lock_exit); -+ -+void __six_lock_init(struct six_lock *lock, const char *name, -+ struct lock_class_key *key, enum six_lock_init_flags flags) -+{ -+ atomic_set(&lock->state, 0); -+ raw_spin_lock_init(&lock->wait_lock); -+ INIT_LIST_HEAD(&lock->wait_list); -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); -+ lockdep_init_map(&lock->dep_map, name, key, 0); -+#endif -+ -+ /* -+ * Don't assume that we have real percpu variables available in -+ * userspace: -+ */ -+#ifdef __KERNEL__ -+ if (flags & SIX_LOCK_INIT_PCPU) { -+ /* -+ * We don't return an error here on memory allocation failure -+ * since percpu is an optimization, and locks will work with the -+ * same semantics in non-percpu mode: callers can check for -+ * failure if they wish by checking lock->readers, but generally -+ * will not want to treat it as an error. -+ */ -+ lock->readers = alloc_percpu(unsigned); -+ } -+#endif -+} -+EXPORT_SYMBOL_GPL(__six_lock_init); -diff --git a/kernel/module/main.c b/kernel/module/main.c -index 4e2cf784c..7f7b5bedf 100644 ---- a/kernel/module/main.c -+++ b/kernel/module/main.c -@@ -56,6 +56,7 @@ - #include - #include - #include -+#include - #include - #include - #include "internal.h" -@@ -1217,15 +1218,19 @@ static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) - return module_alloc(size); - } - --static void module_memory_free(void *ptr, enum mod_mem_type type) -+static void module_memory_free(void *ptr, enum mod_mem_type type, -+ bool unload_codetags) - { -+ if (!unload_codetags && mod_mem_type_is_core_data(type)) -+ return; -+ - if (mod_mem_use_vmalloc(type)) - vfree(ptr); - else - module_memfree(ptr); - } - --static void free_mod_mem(struct module *mod) -+static void free_mod_mem(struct module *mod, bool unload_codetags) - { - for_each_mod_mem_type(type) { - struct module_memory *mod_mem = &mod->mem[type]; -@@ -1236,19 +1241,23 @@ static void free_mod_mem(struct module *mod) - /* Free lock-classes; relies on the preceding sync_rcu(). */ - lockdep_free_key_range(mod_mem->base, mod_mem->size); - if (mod_mem->size) -- module_memory_free(mod_mem->base, type); -+ module_memory_free(mod_mem->base, type, -+ unload_codetags); - } - - /* MOD_DATA hosts mod, so free it at last */ - lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); -- module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA); -+ module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA, unload_codetags); - } - - /* Free a module, remove from lists, etc. 
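
[Illustrative note on the six.c hunk dropped above: the file exposes a three-state shared/intent/exclusive lock. The sketch below is built only from the entry points visible in this hunk (__six_lock_init, six_trylock_ip, six_lock_tryupgrade, six_unlock_ip, six_lock_exit) and is not the upstream bcachefs calling convention; the six_lock_read()/six_unlock_read() convenience wrappers live in the six.h header, which is not part of this hunk. The demo_* names, the <linux/six.h> include and the flags value 0 are assumptions for illustration only.]

	/* Minimal sketch, assuming <linux/six.h> declares the exported functions above. */
	static struct six_lock demo_lock;
	static struct lock_class_key demo_key;

	static void demo_six_lock(void)
	{
		__six_lock_init(&demo_lock, "demo_lock", &demo_key, 0);

		/* Shared access: SIX_LOCK_read may be held by many threads at once. */
		if (!six_trylock_ip(&demo_lock, SIX_LOCK_read, _THIS_IP_))
			return;

		/* ... inspect the protected structure ... */

		/* Upgrade read -> intent; intent only excludes other intent holders. */
		if (six_lock_tryupgrade(&demo_lock)) {
			/* Write requires intent to be held and additionally excludes readers. */
			if (six_trylock_ip(&demo_lock, SIX_LOCK_write, _THIS_IP_)) {
				/* ... modify the protected structure ... */
				six_unlock_ip(&demo_lock, SIX_LOCK_write, _THIS_IP_);
			}
			six_unlock_ip(&demo_lock, SIX_LOCK_intent, _THIS_IP_);
		} else {
			six_unlock_ip(&demo_lock, SIX_LOCK_read, _THIS_IP_);
		}

		six_lock_exit(&demo_lock);
	}
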
*/ - static void free_module(struct module *mod) - { -+ bool unload_codetags; -+ - trace_module_free(mod); - -+ unload_codetags = codetag_unload_module(mod); - mod_sysfs_teardown(mod); - - /* -@@ -1290,7 +1299,7 @@ static void free_module(struct module *mod) - kfree(mod->args); - percpu_modfree(mod); - -- free_mod_mem(mod); -+ free_mod_mem(mod, unload_codetags); - } - - void *__symbol_get(const char *symbol) -@@ -2292,7 +2301,7 @@ static int move_module(struct module *mod, struct load_info *info) - return 0; - out_enomem: - for (t--; t >= 0; t--) -- module_memory_free(mod->mem[t].base, t); -+ module_memory_free(mod->mem[t].base, t, true); - return ret; - } - -@@ -2422,7 +2431,7 @@ static void module_deallocate(struct module *mod, struct load_info *info) - percpu_modfree(mod); - module_arch_freeing_init(mod); - -- free_mod_mem(mod); -+ free_mod_mem(mod, true); - } - - int __weak module_finalize(const Elf_Ehdr *hdr, -@@ -2974,6 +2983,8 @@ static int load_module(struct load_info *info, const char __user *uargs, - /* Get rid of temporary copy. */ - free_copy(info, flags); - -+ codetag_load_module(mod); -+ - /* Done! */ - trace_module_load(mod); - diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9ed5ce989..4f6582487 100644 --- a/kernel/stacktrace.c @@ -98479,47 +97496,10 @@ index 5c2da561c..f78bc8b42 100644 bool depends on !NO_IOMEM diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index ce51d4dc6..a19ec6fd7 100644 +index d6798513a..69a3e33d1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug -@@ -957,6 +957,36 @@ config DEBUG_STACKOVERFLOW - - If in doubt, say "N". - -+config CODE_TAGGING -+ bool -+ select KALLSYMS -+ -+config MEM_ALLOC_PROFILING -+ bool "Enable memory allocation profiling" -+ default n -+ depends on PROC_FS -+ select CODE_TAGGING -+ select PAGE_EXTENSION -+ select SLAB_OBJ_EXT -+ help -+ Track allocation source code and record total allocation size -+ initiated at that code location. The mechanism can be used to track -+ memory leaks with a low performance and memory impact. -+ -+config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT -+ bool "Enable memory allocation profiling by default" -+ default y -+ depends on MEM_ALLOC_PROFILING -+ -+config MEM_ALLOC_PROFILING_DEBUG -+ bool "Memory allocation profiler debugging" -+ default n -+ depends on MEM_ALLOC_PROFILING -+ select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT -+ help -+ Adds warnings with helpful error messages for memory allocation -+ profiling. -+ - source "lib/Kconfig.kasan" - source "lib/Kconfig.kfence" - source "lib/Kconfig.kmsan" -@@ -1637,6 +1667,15 @@ config DEBUG_NOTIFIERS +@@ -1710,6 +1710,15 @@ config DEBUG_NOTIFIERS This is a relatively cheap check but if you care about maximum performance, say N. @@ -98535,20 +97515,7 @@ index ce51d4dc6..a19ec6fd7 100644 config BUG_ON_DATA_CORRUPTION bool "Trigger a BUG when data corruption is detected" select DEBUG_LIST -@@ -1997,6 +2036,12 @@ config FAULT_INJECTION_STACKTRACE_FILTER - help - Provide stacktrace filter for fault-injection capabilities - -+config CODETAG_FAULT_INJECTION -+ bool "Code tagging based fault injection" -+ select CODE_TAGGING -+ help -+ Dynamic fault injection based on code tagging -+ - config ARCH_HAS_KCOV - bool - help -@@ -2123,6 +2168,15 @@ config CPUMASK_KUNIT_TEST +@@ -2196,6 +2205,15 @@ config CPUMASK_KUNIT_TEST If unsure, say N. 
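
[For reference: the instrumentation removed from the patch above was gated behind the Kconfig symbols deleted in this lib/Kconfig.debug hunk, while CONFIG_DEBUG_CLOSURES is kept. Against a tree still carrying those hunks, a .config fragment enabling the whole set would have looked like the lines below; CODE_TAGGING, PAGE_EXTENSION and SLAB_OBJ_EXT are selected automatically by MEM_ALLOC_PROFILING, and MEM_ALLOC_PROFILING_DEBUG already selects MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.]

	CONFIG_MEM_ALLOC_PROFILING=y
	CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y
	CONFIG_MEM_ALLOC_PROFILING_DEBUG=y
	CONFIG_CODETAG_FAULT_INJECTION=y
	CONFIG_DEBUG_CLOSURES=y
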
@@ -98565,31 +97532,10 @@ index ce51d4dc6..a19ec6fd7 100644 tristate "Linked list sorting test" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile -index 876fcdeae..fb1d20939 100644 +index 1ffae65bb..5ac5d72ba 100644 --- a/lib/Makefile +++ b/lib/Makefile -@@ -30,7 +30,7 @@ endif - lib-y := ctype.o string.o vsprintf.o cmdline.o \ - rbtree.o radix-tree.o timerqueue.o xarray.o \ - maple_tree.o idr.o extable.o irq_regs.o argv_split.o \ -- flex_proportions.o ratelimit.o show_mem.o \ -+ flex_proportions.o ratelimit.o \ - is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ - nmi_backtrace.o win_minmax.o memcat_p.o \ -@@ -226,6 +226,11 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ - of-reconfig-notifier-error-inject.o - obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o - -+obj-$(CONFIG_CODE_TAGGING) += codetag.o -+obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o -+ -+obj-$(CONFIG_CODETAG_FAULT_INJECTION) += dynamic_fault.o -+ - lib-$(CONFIG_GENERIC_BUG) += bug.o - - obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o -@@ -248,6 +253,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o +@@ -254,6 +254,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o @@ -98598,245 +97544,14 @@ index 876fcdeae..fb1d20939 100644 obj-$(CONFIG_DQL) += dynamic_queue_limits.o obj-$(CONFIG_GLOB) += glob.o -diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c -new file mode 100644 -index 000000000..1ca90cff5 ---- /dev/null -+++ b/lib/alloc_tag.c -@@ -0,0 +1,225 @@ -+// SPDX-License-Identifier: GPL-2.0-only -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static struct codetag_type *alloc_tag_cttype; -+ -+DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, -+ mem_alloc_profiling_key); -+ -+static void *allocinfo_start(struct seq_file *m, loff_t *pos) -+{ -+ struct codetag_iterator *iter; -+ struct codetag *ct; -+ loff_t node = *pos; -+ -+ iter = kzalloc(sizeof(*iter), GFP_KERNEL); -+ m->private = iter; -+ if (!iter) -+ return NULL; -+ -+ codetag_lock_module_list(alloc_tag_cttype, true); -+ *iter = codetag_get_ct_iter(alloc_tag_cttype); -+ while ((ct = codetag_next_ct(iter)) != NULL && node) -+ node--; -+ -+ return ct ? 
iter : NULL; -+} -+ -+static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos) -+{ -+ struct codetag_iterator *iter = (struct codetag_iterator *)arg; -+ struct codetag *ct = codetag_next_ct(iter); -+ -+ (*pos)++; -+ if (!ct) -+ return NULL; -+ -+ return iter; -+} -+ -+static void allocinfo_stop(struct seq_file *m, void *arg) -+{ -+ struct codetag_iterator *iter = (struct codetag_iterator *)m->private; -+ -+ if (iter) { -+ codetag_lock_module_list(alloc_tag_cttype, false); -+ kfree(iter); -+ } -+} -+ -+static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct) -+{ -+ struct alloc_tag *tag = ct_to_alloc_tag(ct); -+ s64 bytes = alloc_tag_read(tag); -+ char val[10], *p = val; -+ -+ if (bytes < 0) { -+ *p++ = '-'; -+ bytes = -bytes; -+ } -+ -+ string_get_size(bytes, 1, -+ STRING_SIZE_BASE2|STRING_SIZE_NOSPACE, -+ p, val + ARRAY_SIZE(val) - p); -+ -+ seq_buf_printf(out, "%8s ", val); -+ codetag_to_text(out, ct); -+ seq_buf_putc(out, ' '); -+ seq_buf_putc(out, '\n'); -+} -+ -+static int allocinfo_show(struct seq_file *m, void *arg) -+{ -+ struct codetag_iterator *iter = (struct codetag_iterator *)arg; -+ char *bufp; -+ size_t n = seq_get_buf(m, &bufp); -+ struct seq_buf buf; -+ -+ seq_buf_init(&buf, bufp, n); -+ alloc_tag_to_text(&buf, iter->ct); -+ seq_commit(m, seq_buf_used(&buf)); -+ return 0; -+} -+ -+static const struct seq_operations allocinfo_seq_op = { -+ .start = allocinfo_start, -+ .next = allocinfo_next, -+ .stop = allocinfo_stop, -+ .show = allocinfo_show, -+}; -+ -+void alloc_tags_show_mem_report(struct seq_buf *s) -+{ -+ struct codetag_iterator iter; -+ struct codetag *ct; -+ struct { -+ struct codetag *tag; -+ size_t bytes; -+ } tags[10], n; -+ unsigned int i, nr = 0; -+ -+ codetag_lock_module_list(alloc_tag_cttype, true); -+ iter = codetag_get_ct_iter(alloc_tag_cttype); -+ while ((ct = codetag_next_ct(&iter))) { -+ n.tag = ct; -+ n.bytes = alloc_tag_read(ct_to_alloc_tag(ct)); -+ -+ for (i = 0; i < nr; i++) -+ if (n.bytes > tags[i].bytes) -+ break; -+ -+ if (i < ARRAY_SIZE(tags)) { -+ nr -= nr == ARRAY_SIZE(tags); -+ memmove(&tags[i + 1], -+ &tags[i], -+ sizeof(tags[0]) * (nr - i)); -+ nr++; -+ tags[i] = n; -+ } -+ } -+ -+ for (i = 0; i < nr; i++) -+ alloc_tag_to_text(s, tags[i].tag); -+ -+ codetag_lock_module_list(alloc_tag_cttype, false); -+} -+ -+static void __init procfs_init(void) -+{ -+ proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op); -+} -+ -+static void alloc_tag_module_load(struct codetag_type *cttype, struct codetag_module *cmod) -+{ -+ struct codetag_iterator iter = codetag_get_ct_iter(cttype); -+ struct codetag *ct; -+ -+ for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { -+ if (iter.cmod != cmod) -+ continue; -+ -+ ct_to_alloc_tag(ct)->bytes_allocated = alloc_percpu(u64); -+ } -+} -+ -+static bool alloc_tag_module_unload(struct codetag_type *cttype, struct codetag_module *cmod) -+{ -+ struct codetag_iterator iter = codetag_get_ct_iter(cttype); -+ bool module_unused = true; -+ struct alloc_tag *tag; -+ struct codetag *ct; -+ size_t bytes; -+ -+ for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { -+ if (iter.cmod != cmod) -+ continue; -+ -+ tag = ct_to_alloc_tag(ct); -+ bytes = alloc_tag_read(tag); -+ -+ if (!WARN(bytes, "%s:%u module %s func:%s has %zu allocated at module unload", -+ ct->filename, ct->lineno, ct->modname, ct->function, bytes)) -+ free_percpu(tag->bytes_allocated); -+ else -+ module_unused = false; -+ } -+ -+ return module_unused; -+} -+ -+static __init bool 
need_page_alloc_tagging(void) -+{ -+ return true; -+} -+ -+static __init void init_page_alloc_tagging(void) -+{ -+} -+ -+struct page_ext_operations page_alloc_tagging_ops = { -+ .size = sizeof(union codetag_ref), -+ .need = need_page_alloc_tagging, -+ .init = init_page_alloc_tagging, -+}; -+EXPORT_SYMBOL(page_alloc_tagging_ops); -+ -+static struct ctl_table memory_allocation_profiling_sysctls[] = { -+ { -+ .procname = "mem_profiling", -+ .data = &mem_alloc_profiling_key, -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+ .mode = 0444, -+#else -+ .mode = 0644, -+#endif -+ .proc_handler = proc_do_static_key, -+ }, -+ { } -+}; -+ -+static int __init alloc_tag_init(void) -+{ -+ const struct codetag_type_desc desc = { -+ .section = "alloc_tags", -+ .tag_size = sizeof(struct alloc_tag), -+ .module_load = alloc_tag_module_load, -+ .module_unload = alloc_tag_module_unload, -+ }; -+ -+ alloc_tag_cttype = codetag_register_type(&desc); -+ if (IS_ERR_OR_NULL(alloc_tag_cttype)) -+ return PTR_ERR(alloc_tag_cttype); -+ -+ register_sysctl_init("vm", memory_allocation_profiling_sysctls); -+ procfs_init(); -+ -+ return 0; -+} -+module_init(alloc_tag_init); diff --git a/drivers/md/bcache/closure.c b/lib/closure.c -similarity index 88% +similarity index 85% rename from drivers/md/bcache/closure.c rename to lib/closure.c -index d8d9394a6..0855e698c 100644 +index d8d9394a6..2958169ce 100644 --- a/drivers/md/bcache/closure.c +++ b/lib/closure.c -@@ -6,13 +6,13 @@ +@@ -6,19 +6,20 @@ * Copyright 2012 Google, Inc. */ @@ -98853,7 +97568,16 @@ index d8d9394a6..0855e698c 100644 static inline void closure_put_after_sub(struct closure *cl, int flags) { int r = flags & CLOSURE_REMAINING_MASK; -@@ -45,6 +45,7 @@ void closure_sub(struct closure *cl, int v) + +- BUG_ON(flags & CLOSURE_GUARD_MASK); +- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); ++ if ((flags & CLOSURE_GUARD_MASK) || ++ (!r && (flags & ~CLOSURE_DESTRUCTOR))) ++ panic("closure_put_after_sub: bogus flags %x remaining %i", flags, r); + + if (!r) { + if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { +@@ -45,6 +46,7 @@ void closure_sub(struct closure *cl, int v) { closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); } @@ -98861,7 +97585,7 @@ index d8d9394a6..0855e698c 100644 /* * closure_put - decrement a closure's refcount -@@ -53,6 +54,7 @@ void closure_put(struct closure *cl) +@@ -53,6 +55,7 @@ void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); } @@ -98869,7 +97593,7 @@ index d8d9394a6..0855e698c 100644 /* * closure_wake_up - wake up all closures on a wait list, without memory barrier -@@ -74,6 +76,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) +@@ -74,6 +77,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) closure_sub(cl, CLOSURE_WAITING + 1); } } @@ -98877,7 +97601,7 @@ index d8d9394a6..0855e698c 100644 /** * closure_wait - add a closure to a waitlist -@@ -93,6 +96,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +@@ -93,6 +97,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) return true; } @@ -98885,7 +97609,7 @@ index d8d9394a6..0855e698c 100644 struct closure_syncer { struct task_struct *task; -@@ -127,8 +131,9 @@ void __sched __closure_sync(struct closure *cl) +@@ -127,8 +132,9 @@ void __sched __closure_sync(struct closure *cl) __set_current_state(TASK_RUNNING); } @@ -98896,7 +97620,7 @@ index d8d9394a6..0855e698c 100644 static LIST_HEAD(closure_list); static DEFINE_SPINLOCK(closure_list_lock); -@@ -144,6 +149,7 @@ 
void closure_debug_create(struct closure *cl) +@@ -144,6 +150,7 @@ void closure_debug_create(struct closure *cl) list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } @@ -98904,7 +97628,7 @@ index d8d9394a6..0855e698c 100644 void closure_debug_destroy(struct closure *cl) { -@@ -156,8 +162,7 @@ void closure_debug_destroy(struct closure *cl) +@@ -156,8 +163,7 @@ void closure_debug_destroy(struct closure *cl) list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } @@ -98914,7 +97638,7 @@ index d8d9394a6..0855e698c 100644 static int debug_show(struct seq_file *f, void *data) { -@@ -181,7 +186,7 @@ static int debug_show(struct seq_file *f, void *data) +@@ -181,7 +187,7 @@ static int debug_show(struct seq_file *f, void *data) seq_printf(f, " W %pS\n", (void *) cl->waiting_on); @@ -98923,7 +97647,7 @@ index d8d9394a6..0855e698c 100644 } spin_unlock_irq(&closure_list_lock); -@@ -190,18 +195,11 @@ static int debug_show(struct seq_file *f, void *data) +@@ -190,18 +196,11 @@ static int debug_show(struct seq_file *f, void *data) DEFINE_SHOW_ATTRIBUTE(debug); @@ -98947,782 +97671,6 @@ index d8d9394a6..0855e698c 100644 -MODULE_AUTHOR("Kent Overstreet "); -MODULE_LICENSE("GPL"); +#endif -diff --git a/lib/codetag.c b/lib/codetag.c -new file mode 100644 -index 000000000..84f90f3b9 ---- /dev/null -+++ b/lib/codetag.c -@@ -0,0 +1,393 @@ -+// SPDX-License-Identifier: GPL-2.0-only -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+struct codetag_type { -+ struct list_head link; -+ unsigned int count; -+ struct idr mod_idr; -+ struct rw_semaphore mod_lock; /* protects mod_idr */ -+ struct codetag_type_desc desc; -+}; -+ -+static DEFINE_MUTEX(codetag_lock); -+static LIST_HEAD(codetag_types); -+ -+void codetag_lock_module_list(struct codetag_type *cttype, bool lock) -+{ -+ if (lock) -+ down_read(&cttype->mod_lock); -+ else -+ up_read(&cttype->mod_lock); -+} -+ -+struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype) -+{ -+ struct codetag_iterator iter = { -+ .cttype = cttype, -+ .cmod = NULL, -+ .mod_id = 0, -+ .ct = NULL, -+ }; -+ -+ return iter; -+} -+ -+static inline struct codetag *get_first_module_ct(struct codetag_module *cmod) -+{ -+ return cmod->range.start < cmod->range.stop ? cmod->range.start : NULL; -+} -+ -+static inline -+struct codetag *get_next_module_ct(struct codetag_iterator *iter) -+{ -+ struct codetag *res = (struct codetag *) -+ ((char *)iter->ct + iter->cttype->desc.tag_size); -+ -+ return res < iter->cmod->range.stop ? 
res : NULL; -+} -+ -+struct codetag *codetag_next_ct(struct codetag_iterator *iter) -+{ -+ struct codetag_type *cttype = iter->cttype; -+ struct codetag_module *cmod; -+ struct codetag *ct; -+ -+ lockdep_assert_held(&cttype->mod_lock); -+ -+ if (unlikely(idr_is_empty(&cttype->mod_idr))) -+ return NULL; -+ -+ ct = NULL; -+ while (true) { -+ cmod = idr_find(&cttype->mod_idr, iter->mod_id); -+ -+ /* If module was removed move to the next one */ -+ if (!cmod) -+ cmod = idr_get_next_ul(&cttype->mod_idr, -+ &iter->mod_id); -+ -+ /* Exit if no more modules */ -+ if (!cmod) -+ break; -+ -+ if (cmod != iter->cmod) { -+ iter->cmod = cmod; -+ ct = get_first_module_ct(cmod); -+ } else -+ ct = get_next_module_ct(iter); -+ -+ if (ct) -+ break; -+ -+ iter->mod_id++; -+ } -+ -+ iter->ct = ct; -+ return ct; -+} -+ -+void codetag_to_text(struct seq_buf *out, struct codetag *ct) -+{ -+ seq_buf_printf(out, "%s:%u module:%s func:%s", -+ ct->filename, ct->lineno, -+ ct->modname, ct->function); -+} -+ -+static inline size_t range_size(const struct codetag_type *cttype, -+ const struct codetag_range *range) -+{ -+ return ((char *)range->stop - (char *)range->start) / -+ cttype->desc.tag_size; -+} -+ -+static void *get_symbol(struct module *mod, const char *prefix, const char *name) -+{ -+ char buf[64]; -+ void *ret; -+ int res; -+ -+ res = snprintf(buf, sizeof(buf), "%s%s", prefix, name); -+ if (WARN_ON(res < 1 || res > sizeof(buf))) -+ return NULL; -+ -+ preempt_disable(); -+ ret = mod ? -+ (void *)find_kallsyms_symbol_value(mod, buf) : -+ (void *)kallsyms_lookup_name(buf); -+ preempt_enable(); -+ -+ return ret; -+} -+ -+static struct codetag_range get_section_range(struct module *mod, -+ const char *section) -+{ -+ return (struct codetag_range) { -+ get_symbol(mod, "__start_", section), -+ get_symbol(mod, "__stop_", section), -+ }; -+} -+ -+static int codetag_module_init(struct codetag_type *cttype, struct module *mod) -+{ -+ struct codetag_range range; -+ struct codetag_module *cmod; -+ int err; -+ -+ range = get_section_range(mod, cttype->desc.section); -+ if (!range.start || !range.stop) { -+ pr_warn("Failed to load code tags of type %s from the module %s\n", -+ cttype->desc.section, -+ mod ? 
mod->name : "(built-in)"); -+ return -EINVAL; -+ } -+ -+ /* Ignore empty ranges */ -+ if (range.start == range.stop) -+ return 0; -+ -+ BUG_ON(range.start > range.stop); -+ -+ cmod = kmalloc(sizeof(*cmod), GFP_KERNEL); -+ if (unlikely(!cmod)) -+ return -ENOMEM; -+ -+ cmod->mod = mod; -+ cmod->range = range; -+ -+ down_write(&cttype->mod_lock); -+ err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL); -+ if (err >= 0) { -+ cttype->count += range_size(cttype, &range); -+ if (cttype->desc.module_load) -+ cttype->desc.module_load(cttype, cmod); -+ } -+ up_write(&cttype->mod_lock); -+ -+ if (err < 0) { -+ kfree(cmod); -+ return err; -+ } -+ -+ return 0; -+} -+ -+struct codetag_type * -+codetag_register_type(const struct codetag_type_desc *desc) -+{ -+ struct codetag_type *cttype; -+ int err; -+ -+ BUG_ON(desc->tag_size <= 0); -+ -+ cttype = kzalloc(sizeof(*cttype), GFP_KERNEL); -+ if (unlikely(!cttype)) -+ return ERR_PTR(-ENOMEM); -+ -+ cttype->desc = *desc; -+ idr_init(&cttype->mod_idr); -+ init_rwsem(&cttype->mod_lock); -+ -+ err = codetag_module_init(cttype, NULL); -+ if (unlikely(err)) { -+ kfree(cttype); -+ return ERR_PTR(err); -+ } -+ -+ mutex_lock(&codetag_lock); -+ list_add_tail(&cttype->link, &codetag_types); -+ mutex_unlock(&codetag_lock); -+ -+ return cttype; -+} -+ -+void codetag_load_module(struct module *mod) -+{ -+ struct codetag_type *cttype; -+ -+ if (!mod) -+ return; -+ -+ mutex_lock(&codetag_lock); -+ list_for_each_entry(cttype, &codetag_types, link) -+ codetag_module_init(cttype, mod); -+ mutex_unlock(&codetag_lock); -+} -+ -+bool codetag_unload_module(struct module *mod) -+{ -+ struct codetag_type *cttype; -+ bool unload_ok = true; -+ -+ if (!mod) -+ return true; -+ -+ mutex_lock(&codetag_lock); -+ list_for_each_entry(cttype, &codetag_types, link) { -+ struct codetag_module *found = NULL; -+ struct codetag_module *cmod; -+ unsigned long mod_id, tmp; -+ -+ down_write(&cttype->mod_lock); -+ idr_for_each_entry_ul(&cttype->mod_idr, cmod, tmp, mod_id) { -+ if (cmod->mod && cmod->mod == mod) { -+ found = cmod; -+ break; -+ } -+ } -+ if (found) { -+ if (cttype->desc.module_unload) -+ if (!cttype->desc.module_unload(cttype, cmod)) -+ unload_ok = false; -+ -+ cttype->count -= range_size(cttype, &cmod->range); -+ idr_remove(&cttype->mod_idr, mod_id); -+ kfree(cmod); -+ } -+ up_write(&cttype->mod_lock); -+ } -+ mutex_unlock(&codetag_lock); -+ -+ return unload_ok; -+} -+ -+/* Codetag query parsing */ -+ -+#define CODETAG_QUERY_TOKENS() \ -+ x(func) \ -+ x(file) \ -+ x(line) \ -+ x(module) \ -+ x(class) \ -+ x(index) -+ -+enum tokens { -+#define x(name) TOK_##name, -+ CODETAG_QUERY_TOKENS() -+#undef x -+}; -+ -+static const char * const token_strs[] = { -+#define x(name) #name, -+ CODETAG_QUERY_TOKENS() -+#undef x -+ NULL -+}; -+ -+static int parse_range(char *str, unsigned int *first, unsigned int *last) -+{ -+ char *first_str = str; -+ char *last_str = strchr(first_str, '-'); -+ -+ if (last_str) -+ *last_str++ = '\0'; -+ -+ if (kstrtouint(first_str, 10, first)) -+ return -EINVAL; -+ -+ if (!last_str) -+ *last = *first; -+ else if (kstrtouint(last_str, 10, last)) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+char *codetag_query_parse(struct codetag_query *q, char *buf) -+{ -+ while (1) { -+ char *p = buf; -+ char *str1 = strsep_no_empty(&p, " \t\r\n"); -+ char *str2 = strsep_no_empty(&p, " \t\r\n"); -+ int ret, token; -+ -+ if (!str1 || !str2) -+ break; -+ -+ token = match_string(token_strs, ARRAY_SIZE(token_strs), str1); -+ if (token < 0) -+ break; -+ -+ switch (token) { -+ case 
TOK_func: -+ q->function = str2; -+ break; -+ case TOK_file: -+ q->filename = str2; -+ break; -+ case TOK_line: -+ ret = parse_range(str2, &q->first_line, &q->last_line); -+ if (ret) -+ return ERR_PTR(ret); -+ q->match_line = true; -+ break; -+ case TOK_module: -+ q->module = str2; -+ break; -+ case TOK_class: -+ q->class = str2; -+ break; -+ case TOK_index: -+ ret = parse_range(str2, &q->first_index, &q->last_index); -+ if (ret) -+ return ERR_PTR(ret); -+ q->match_index = true; -+ break; -+ } -+ -+ buf = p; -+ } -+ -+ return buf; -+} -+ -+bool codetag_matches_query(struct codetag_query *q, -+ const struct codetag *ct, -+ const struct codetag_module *mod, -+ const char *class) -+{ -+ size_t classlen = q->class ? strlen(q->class) : 0; -+ -+ if (q->module && -+ (!mod->mod || -+ strcmp(q->module, ct->modname))) -+ return false; -+ -+ if (q->filename && -+ strcmp(q->filename, ct->filename) && -+ strcmp(q->filename, kbasename(ct->filename))) -+ return false; -+ -+ if (q->function && -+ strcmp(q->function, ct->function)) -+ return false; -+ -+ /* match against the line number range */ -+ if (q->match_line && -+ (ct->lineno < q->first_line || -+ ct->lineno > q->last_line)) -+ return false; -+ -+ /* match against the class */ -+ if (classlen && -+ (strncmp(q->class, class, classlen) || -+ (class[classlen] && class[classlen] != ':'))) -+ return false; -+ -+ /* match against the fault index */ -+ if (q->match_index && -+ (q->cur_index < q->first_index || -+ q->cur_index > q->last_index)) { -+ q->cur_index++; -+ return false; -+ } -+ -+ q->cur_index++; -+ return true; -+} -diff --git a/lib/dynamic_fault.c b/lib/dynamic_fault.c -new file mode 100644 -index 000000000..c92374359 ---- /dev/null -+++ b/lib/dynamic_fault.c -@@ -0,0 +1,371 @@ -+// SPDX-License-Identifier: GPL-2.0-only -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static struct codetag_type *cttype; -+ -+bool __dynamic_fault_enabled(struct dfault *df) -+{ -+ union dfault_state old, new; -+ unsigned int v = df->state.v; -+ bool ret; -+ -+ do { -+ old.v = new.v = v; -+ -+ if (new.enabled == DFAULT_disabled) -+ return false; -+ -+ ret = df->frequency -+ ? ++new.count >= df->frequency -+ : true; -+ if (ret) -+ new.count = 0; -+ if (ret && new.enabled == DFAULT_oneshot) -+ new.enabled = DFAULT_disabled; -+ } while ((v = cmpxchg(&df->state.v, old.v, new.v)) != old.v); -+ -+ if (ret) -+ pr_debug("returned true for %s:%u", df->tag.filename, df->tag.lineno); -+ -+ return ret; -+} -+EXPORT_SYMBOL(__dynamic_fault_enabled); -+ -+static const char * const dfault_state_strs[] = { -+#define x(n) #n, -+ DFAULT_STATES() -+#undef x -+ NULL -+}; -+ -+static void dynamic_fault_to_text(struct seq_buf *out, struct dfault *df) -+{ -+ codetag_to_text(out, &df->tag); -+ seq_buf_printf(out, "class:%s %s \"", df->class, -+ dfault_state_strs[df->state.enabled]); -+} -+ -+struct dfault_query { -+ struct codetag_query q; -+ -+ bool set_enabled:1; -+ unsigned int enabled:2; -+ -+ bool set_frequency:1; -+ unsigned int frequency; -+}; -+ -+/* -+ * Search the tables for _dfault's which match the given -+ * `query' and apply the `flags' and `mask' to them. Tells -+ * the user which dfault's were changed, or whether none -+ * were matched. 
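
[Putting the two halves together: a line written to the dynamic_fault control interface is first consumed by codetag_query_parse() above as token/value pairs (func, file, line, module, class, index, where line and index accept N or N-M ranges), and the remaining words are handed to the command keywords defined just below (disable, enable, oneshot, frequency). A few illustrative control strings follow; the file, function, module and class names, and the debugfs path, are made up for the example and are not part of this hunk.]

	/*
	 * Example lines written to the dynamic_fault control file
	 * (registered later in this file, outside this hunk):
	 *
	 *   file fs/foo/bar.c enable          - enable every fault point in one file
	 *   func foo_read_page oneshot        - fire each matching fault point once
	 *   module foo index 0-3 disable      - first four matches within module "foo"
	 *   class memory line 100-200 enable  - fault points of one class in a line range
	 */
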
-+ */ -+static int dfault_change(struct dfault_query *query) -+{ -+ struct codetag_iterator ct_iter = codetag_get_ct_iter(cttype); -+ struct codetag *ct; -+ unsigned int nfound = 0; -+ -+ codetag_lock_module_list(cttype, true); -+ -+ while ((ct = codetag_next_ct(&ct_iter))) { -+ struct dfault *df = container_of(ct, struct dfault, tag); -+ -+ if (!codetag_matches_query(&query->q, ct, ct_iter.cmod, df->class)) -+ continue; -+ -+ if (query->set_enabled && -+ query->enabled != df->state.enabled) { -+ if (query->enabled != DFAULT_disabled) -+ static_key_slow_inc(&df->enabled.key); -+ else if (df->state.enabled != DFAULT_disabled) -+ static_key_slow_dec(&df->enabled.key); -+ -+ df->state.enabled = query->enabled; -+ } -+ -+ if (query->set_frequency) -+ df->frequency = query->frequency; -+ -+ pr_debug("changed %s:%d [%s]%s #%d %s", -+ df->tag.filename, df->tag.lineno, df->tag.modname, -+ df->tag.function, query->q.cur_index, -+ dfault_state_strs[df->state.enabled]); -+ -+ nfound++; -+ } -+ -+ pr_debug("dfault: %u matches", nfound); -+ -+ codetag_lock_module_list(cttype, false); -+ -+ return nfound ? 0 : -ENOENT; -+} -+ -+#define DFAULT_TOKENS() \ -+ x(disable, 0) \ -+ x(enable, 0) \ -+ x(oneshot, 0) \ -+ x(frequency, 1) -+ -+enum dfault_token { -+#define x(name, nr_args) TOK_##name, -+ DFAULT_TOKENS() -+#undef x -+}; -+ -+static const char * const dfault_token_strs[] = { -+#define x(name, nr_args) #name, -+ DFAULT_TOKENS() -+#undef x -+ NULL -+}; -+ -+static unsigned int dfault_token_nr_args[] = { -+#define x(name, nr_args) nr_args, -+ DFAULT_TOKENS() -+#undef x -+}; -+ -+static enum dfault_token str_to_token(const char *word, unsigned int nr_words) -+{ -+ int tok = match_string(dfault_token_strs, ARRAY_SIZE(dfault_token_strs), word); -+ -+ if (tok < 0) { -+ pr_debug("unknown keyword \"%s\"", word); -+ return tok; -+ } -+ -+ if (nr_words < dfault_token_nr_args[tok]) { -+ pr_debug("insufficient arguments to \"%s\"", word); -+ return -EINVAL; -+ } -+ -+ return tok; -+} -+ -+static int dfault_parse_command(struct dfault_query *query, -+ enum dfault_token tok, -+ char *words[], size_t nr_words) -+{ -+ unsigned int i = 0; -+ int ret; -+ -+ switch (tok) { -+ case TOK_disable: -+ query->set_enabled = true; -+ query->enabled = DFAULT_disabled; -+ break; -+ case TOK_enable: -+ query->set_enabled = true; -+ query->enabled = DFAULT_enabled; -+ break; -+ case TOK_oneshot: -+ query->set_enabled = true; -+ query->enabled = DFAULT_oneshot; -+ break; -+ case TOK_frequency: -+ query->set_frequency = 1; -+ ret = kstrtouint(words[i++], 10, &query->frequency); -+ if (ret) -+ return ret; -+ -+ if (!query->set_enabled) { -+ query->set_enabled = 1; -+ query->enabled = DFAULT_enabled; -+ } -+ break; -+ } -+ -+ return i; -+} -+ -+static int dynamic_fault_store(char *buf) -+{ -+ struct dfault_query query = { NULL }; -+#define MAXWORDS 9 -+ char *tok, *words[MAXWORDS]; -+ int ret, nr_words, i = 0; -+ -+ buf = codetag_query_parse(&query.q, buf); -+ if (IS_ERR(buf)) -+ return PTR_ERR(buf); -+ -+ while ((tok = strsep_no_empty(&buf, " \t\r\n"))) { -+ if (nr_words == ARRAY_SIZE(words)) -+ return -EINVAL; /* ran out of words[] before bytes */ -+ words[nr_words++] = tok; -+ } -+ -+ while (i < nr_words) { -+ const char *tok_str = words[i++]; -+ enum dfault_token tok = str_to_token(tok_str, nr_words - i); -+ -+ if (tok < 0) -+ return tok; -+ -+ ret = dfault_parse_command(&query, tok, words + i, nr_words - i); -+ if (ret < 0) -+ return ret; -+ -+ i += ret; -+ BUG_ON(i > nr_words); -+ } -+ -+ pr_debug("q->function=\"%s\" 
q->filename=\"%s\" " -+ "q->module=\"%s\" q->line=%u-%u\n q->index=%u-%u", -+ query.q.function, query.q.filename, query.q.module, -+ query.q.first_line, query.q.last_line, -+ query.q.first_index, query.q.last_index); -+ -+ ret = dfault_change(&query); -+ if (ret < 0) -+ return ret; -+ -+ return 0; -+} -+ -+struct dfault_iter { -+ struct codetag_iterator ct_iter; -+ -+ struct seq_buf buf; -+ char rawbuf[4096]; -+}; -+ -+static int dfault_open(struct inode *inode, struct file *file) -+{ -+ struct dfault_iter *iter; -+ -+ iter = kzalloc(sizeof(*iter), GFP_KERNEL); -+ if (!iter) -+ return -ENOMEM; -+ -+ codetag_lock_module_list(cttype, true); -+ iter->ct_iter = codetag_get_ct_iter(cttype); -+ codetag_lock_module_list(cttype, false); -+ -+ file->private_data = iter; -+ seq_buf_init(&iter->buf, iter->rawbuf, sizeof(iter->rawbuf)); -+ return 0; -+} -+ -+static int dfault_release(struct inode *inode, struct file *file) -+{ -+ struct dfault_iter *iter = file->private_data; -+ -+ kfree(iter); -+ return 0; -+} -+ -+struct user_buf { -+ char __user *buf; /* destination user buffer */ -+ size_t size; /* size of requested read */ -+ ssize_t ret; /* bytes read so far */ -+}; -+ -+static int flush_ubuf(struct user_buf *dst, struct seq_buf *src) -+{ -+ if (src->len) { -+ size_t bytes = min_t(size_t, src->len, dst->size); -+ int err = copy_to_user(dst->buf, src->buffer, bytes); -+ -+ if (err) -+ return err; -+ -+ dst->ret += bytes; -+ dst->buf += bytes; -+ dst->size -= bytes; -+ src->len -= bytes; -+ memmove(src->buffer, src->buffer + bytes, src->len); -+ } -+ -+ return 0; -+} -+ -+static ssize_t dfault_read(struct file *file, char __user *ubuf, -+ size_t size, loff_t *ppos) -+{ -+ struct dfault_iter *iter = file->private_data; -+ struct user_buf buf = { .buf = ubuf, .size = size }; -+ struct codetag *ct; -+ struct dfault *df; -+ int err; -+ -+ codetag_lock_module_list(iter->ct_iter.cttype, true); -+ while (1) { -+ err = flush_ubuf(&buf, &iter->buf); -+ if (err || !buf.size) -+ break; -+ -+ ct = codetag_next_ct(&iter->ct_iter); -+ if (!ct) -+ break; -+ -+ df = container_of(ct, struct dfault, tag); -+ dynamic_fault_to_text(&iter->buf, df); -+ seq_buf_putc(&iter->buf, '\n'); -+ } -+ codetag_lock_module_list(iter->ct_iter.cttype, false); -+ -+ return err ?: buf.ret; -+} -+ -+/* -+ * File_ops->write method for /dynamic_fault/conrol. Gathers the -+ * command text from userspace, parses and executes it. 
-+ */ -+static ssize_t dfault_write(struct file *file, const char __user *ubuf, -+ size_t len, loff_t *offp) -+{ -+ char tmpbuf[256]; -+ -+ if (len == 0) -+ return 0; -+ /* we don't check *offp -- multiple writes() are allowed */ -+ if (len > sizeof(tmpbuf)-1) -+ return -E2BIG; -+ if (copy_from_user(tmpbuf, ubuf, len)) -+ return -EFAULT; -+ tmpbuf[len] = '\0'; -+ pr_debug("read %zu bytes from userspace", len); -+ -+ dynamic_fault_store(tmpbuf); -+ -+ *offp += len; -+ return len; -+} -+ -+static const struct file_operations dfault_ops = { -+ .owner = THIS_MODULE, -+ .open = dfault_open, -+ .release = dfault_release, -+ .read = dfault_read, -+ .write = dfault_write -+}; -+ -+static int __init dynamic_fault_init(void) -+{ -+ const struct codetag_type_desc desc = { -+ .section = "dynamic_fault_tags", -+ .tag_size = sizeof(struct dfault), -+ }; -+ struct dentry *debugfs_file; -+ -+ cttype = codetag_register_type(&desc); -+ if (IS_ERR_OR_NULL(cttype)) -+ return PTR_ERR(cttype); -+ -+ debugfs_file = debugfs_create_file("dynamic_faults", 0666, NULL, NULL, &dfault_ops); -+ if (IS_ERR(debugfs_file)) -+ return PTR_ERR(debugfs_file); -+ -+ return 0; -+} -+module_init(dynamic_fault_init); diff --git a/lib/errname.c b/lib/errname.c index 67739b174..dd1b99855 100644 --- a/lib/errname.c @@ -99840,10 +97788,10 @@ index f25eb111c..41f1bcdc4 100644 { if (level) { diff --git a/lib/iov_iter.c b/lib/iov_iter.c -index 960223ed9..f9c4bba27 100644 +index e4dc809d1..eb3dffb24 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c -@@ -857,24 +857,37 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i) +@@ -566,24 +566,37 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(iov_iter_zero); @@ -100330,49 +98278,10 @@ index 000000000..f45591a16 +MODULE_AUTHOR("Daniel B. 
Hill"); +MODULE_LICENSE("GPL"); diff --git a/lib/rhashtable.c b/lib/rhashtable.c -index 6ae2ba8e0..76e5bf9be 100644 +index 6ae2ba8e0..d3fce9c89 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c -@@ -130,7 +130,7 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht, - if (ntbl) - return ntbl; - -- ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); -+ ntbl = kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO); - - if (ntbl && leaf) { - for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) -@@ -157,7 +157,7 @@ static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, - - size = sizeof(*tbl) + sizeof(tbl->buckets[0]); - -- tbl = kzalloc(size, gfp); -+ tbl = kmalloc_noprof(size, gfp|__GFP_ZERO); - if (!tbl) - return NULL; - -@@ -180,8 +180,10 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, - size_t size; - int i; - static struct lock_class_key __key; -+ struct alloc_tag * __maybe_unused old = alloc_tag_save(ht->alloc_tag); - -- tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp); -+ tbl = kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets), -+ gfp|__GFP_ZERO, NUMA_NO_NODE); - - size = nbuckets; - -@@ -190,6 +192,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, - nbuckets = 0; - } - -+ alloc_tag_restore(ht->alloc_tag, old); -+ - if (tbl == NULL) - return NULL; - -@@ -360,9 +364,14 @@ static int rhashtable_rehash_alloc(struct rhashtable *ht, +@@ -360,9 +360,14 @@ static int rhashtable_rehash_alloc(struct rhashtable *ht, ASSERT_RHT_MUTEX(ht); @@ -100389,85 +98298,6 @@ index 6ae2ba8e0..76e5bf9be 100644 err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); if (err) -@@ -975,7 +984,7 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) - } - - /** -- * rhashtable_init - initialize a new hash table -+ * rhashtable_init_noprof - initialize a new hash table - * @ht: hash table to be initialized - * @params: configuration parameters - * -@@ -1016,7 +1025,7 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) - * .obj_hashfn = my_hash_fn, - * }; - */ --int rhashtable_init(struct rhashtable *ht, -+int rhashtable_init_noprof(struct rhashtable *ht, - const struct rhashtable_params *params) - { - struct bucket_table *tbl; -@@ -1031,6 +1040,10 @@ int rhashtable_init(struct rhashtable *ht, - spin_lock_init(&ht->lock); - memcpy(&ht->p, params, sizeof(*params)); - -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ ht->alloc_tag = current->alloc_tag; -+#endif -+ - if (params->min_size) - ht->p.min_size = roundup_pow_of_two(params->min_size); - -@@ -1076,26 +1089,26 @@ int rhashtable_init(struct rhashtable *ht, - - return 0; - } --EXPORT_SYMBOL_GPL(rhashtable_init); -+EXPORT_SYMBOL_GPL(rhashtable_init_noprof); - - /** -- * rhltable_init - initialize a new hash list table -+ * rhltable_init_noprof - initialize a new hash list table - * @hlt: hash list table to be initialized - * @params: configuration parameters - * - * Initializes a new hash list table. - * -- * See documentation for rhashtable_init. -+ * See documentation for rhashtable_init_noprof. 
- */ --int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) -+int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params) - { - int err; - -- err = rhashtable_init(&hlt->ht, params); -+ err = rhashtable_init_noprof(&hlt->ht, params); - hlt->ht.rhlist = true; - return err; - } --EXPORT_SYMBOL_GPL(rhltable_init); -+EXPORT_SYMBOL_GPL(rhltable_init_noprof); - - static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, - void (*free_fn)(void *ptr, void *arg), -@@ -1222,6 +1235,7 @@ struct rhash_lock_head __rcu **rht_bucket_nested_insert( - unsigned int index = hash & ((1 << tbl->nest) - 1); - unsigned int size = tbl->size >> tbl->nest; - union nested_table *ntbl; -+ struct alloc_tag * __maybe_unused old = alloc_tag_save(ht->alloc_tag); - - ntbl = nested_table_top(tbl); - hash >>= tbl->nest; -@@ -1236,6 +1250,8 @@ struct rhash_lock_head __rcu **rht_bucket_nested_insert( - size <= (1 << shift)); - } - -+ alloc_tag_restore(ht->alloc_tag, old); -+ - if (!ntbl) - return NULL; - diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 45c450f42..2b87e9219 100644 --- a/lib/seq_buf.c @@ -100486,38 +98316,8 @@ index 45c450f42..2b87e9219 100644 + seq_buf_commit(s, wrote); +} +EXPORT_SYMBOL(seq_buf_human_readable_u64); -diff --git a/lib/string.c b/lib/string.c -index 3d55ef890..dd4914baf 100644 ---- a/lib/string.c -+++ b/lib/string.c -@@ -520,6 +520,25 @@ char *strsep(char **s, const char *ct) - EXPORT_SYMBOL(strsep); - #endif - -+/** -+ * strsep_no_empt - Split a string into tokens, but don't return empty tokens -+ * @s: The string to be searched -+ * @ct: The characters to search for -+ * -+ * strsep() updates @s to point after the token, ready for the next call. -+ */ -+char *strsep_no_empty(char **s, const char *ct) -+{ -+ char *ret; -+ -+ do { -+ ret = strsep(s, ct); -+ } while (ret && !*ret); -+ -+ return ret; -+} -+EXPORT_SYMBOL_GPL(strsep_no_empty); -+ - #ifndef __HAVE_ARCH_MEMSET - /** - * memset - Fill a region of memory with the given value diff --git a/lib/string_helpers.c b/lib/string_helpers.c -index 230020a2e..d527ce455 100644 +index d3b1dd718..c29dd105b 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -19,11 +19,17 @@ @@ -100591,97 +98391,11 @@ index 9a68849a5..0b01ffca9 100644 test_string_get_size_check("STRING_UNITS_10", exp_result10, buf10, size, blk_size); -diff --git a/mm/Makefile b/mm/Makefile -index e29afc890..e2ecfe0ea 100644 ---- a/mm/Makefile -+++ b/mm/Makefile -@@ -53,7 +53,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ - mm_init.o percpu.o slab_common.o \ - compaction.o \ - interval_tree.o list_lru.o workingset.o \ -- debug.o gup.o mmap_lock.o $(mmu-y) -+ debug.o gup.o mmap_lock.o show_mem.o $(mmu-y) - - # Give 'page_alloc' its own module-parameter namespace - page-alloc-y := page_alloc.o -diff --git a/mm/compaction.c b/mm/compaction.c -index c8bcdea15..09dd56a94 100644 ---- a/mm/compaction.c -+++ b/mm/compaction.c -@@ -1684,8 +1684,8 @@ static void isolate_freepages(struct compact_control *cc) - * This is a migrate-callback that "allocates" freepages by taking pages - * from the isolated freelists in the block we are migrating to. 
- */ --static struct page *compaction_alloc(struct page *migratepage, -- unsigned long data) -+static struct page *compaction_alloc_noprof(struct page *migratepage, -+ unsigned long data) - { - struct compact_control *cc = (struct compact_control *)data; - struct page *freepage; -@@ -1704,6 +1704,12 @@ static struct page *compaction_alloc(struct page *migratepage, - return freepage; - } - -+static struct page *compaction_alloc(struct page *migratepage, -+ unsigned long data) -+{ -+ return alloc_hooks(compaction_alloc_noprof(migratepage, data)); -+} -+ - /* - * This is a migrate-callback that "frees" freepages back to the isolated - * freelist. All pages on the freelist are from the same zone, so there is no -diff --git a/mm/filemap.c b/mm/filemap.c -index 8abce63b2..e38eec523 100644 ---- a/mm/filemap.c -+++ b/mm/filemap.c -@@ -958,7 +958,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, - EXPORT_SYMBOL_GPL(filemap_add_folio); - - #ifdef CONFIG_NUMA --struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) -+struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) - { - int n; - struct folio *folio; -@@ -973,9 +973,9 @@ struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) - - return folio; - } -- return folio_alloc(gfp, order); -+ return folio_alloc_noprof(gfp, order); - } --EXPORT_SYMBOL(filemap_alloc_folio); -+EXPORT_SYMBOL(filemap_alloc_folio_noprof); - #endif - - /* -diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index 624671aaa..221cce005 100644 ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -37,6 +37,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -2557,6 +2558,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, - /* Caller disabled irqs, so they are still disabled here */ - - split_page_owner(head, nr); -+ pgalloc_tag_split(head, nr); - - /* See comment in __split_huge_page_tail() */ - if (PageAnon(head)) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c -index f791076da..3e5a604ee 100644 +index 6da626bfb..4165e22b0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c -@@ -3246,7 +3246,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) +@@ -3270,7 +3270,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) if (i == h->max_huge_pages_node[nid]) return; @@ -100690,7 +98404,7 @@ index f791076da..3e5a604ee 100644 pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", h->max_huge_pages_node[nid], buf, nid, i); h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); -@@ -3308,7 +3308,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) +@@ -3332,7 +3332,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (i < h->max_huge_pages) { char buf[32]; @@ -100699,7 +98413,7 @@ index f791076da..3e5a604ee 100644 pr_warn("HugeTLB: allocating %lu of page size %s failed. 
Only allocated %lu hugepages.\n", h->max_huge_pages, buf, i); h->max_huge_pages = i; -@@ -3354,7 +3354,7 @@ static void __init report_hugepages(void) +@@ -3378,7 +3378,7 @@ static void __init report_hugepages(void) for_each_hstate(h) { char buf[32]; @@ -100708,7 +98422,7 @@ index f791076da..3e5a604ee 100644 pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", buf, h->free_huge_pages); pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", -@@ -4245,7 +4245,7 @@ static int __init hugetlb_init(void) +@@ -4269,7 +4269,7 @@ static int __init hugetlb_init(void) char buf[32]; string_get_size(huge_page_size(&default_hstate), @@ -100717,65 +98431,11 @@ index f791076da..3e5a604ee 100644 pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", default_hstate.max_huge_pages, buf); pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", -diff --git a/mm/kfence/core.c b/mm/kfence/core.c -index dad3c0eb7..aea6fa145 100644 ---- a/mm/kfence/core.c -+++ b/mm/kfence/core.c -@@ -590,9 +590,9 @@ static unsigned long kfence_init_pool(void) - continue; - - __folio_set_slab(slab_folio(slab)); --#ifdef CONFIG_MEMCG -- slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg | -- MEMCG_DATA_OBJCGS; -+#ifdef CONFIG_MEMCG_KMEM -+ slab->obj_exts = (unsigned long)&kfence_metadata[i / 2 - 1].obj_exts | -+ MEMCG_DATA_OBJEXTS; - #endif - } - -@@ -634,8 +634,8 @@ static unsigned long kfence_init_pool(void) - - if (!i || (i % 2)) - continue; --#ifdef CONFIG_MEMCG -- slab->memcg_data = 0; -+#ifdef CONFIG_MEMCG_KMEM -+ slab->obj_exts = 0; - #endif - __folio_clear_slab(slab_folio(slab)); - } -@@ -1093,8 +1093,8 @@ void __kfence_free(void *addr) - { - struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); - --#ifdef CONFIG_MEMCG -- KFENCE_WARN_ON(meta->objcg); -+#ifdef CONFIG_MEMCG_KMEM -+ KFENCE_WARN_ON(meta->obj_exts.objcg); - #endif - /* - * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing -diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h -index 392fb273e..b02d2cb96 100644 ---- a/mm/kfence/kfence.h -+++ b/mm/kfence/kfence.h -@@ -97,8 +97,8 @@ struct kfence_metadata { - struct kfence_track free_track; - /* For updating alloc_covered on frees. */ - u32 alloc_stack_hash; --#ifdef CONFIG_MEMCG -- struct obj_cgroup *objcg; -+#ifdef CONFIG_MEMCG_KMEM -+ struct slabobj_ext obj_exts; - #endif - }; - diff --git a/mm/madvise.c b/mm/madvise.c -index b5ffbaf61..e08639a7c 100644 +index ec30f48f8..fa2f140d0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c -@@ -1311,6 +1311,64 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, +@@ -1330,6 +1330,64 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ @@ -100840,7 +98500,7 @@ index b5ffbaf61..e08639a7c 100644 /* * The madvise(2) system call. 
* -@@ -1390,6 +1448,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh +@@ -1409,6 +1467,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh size_t len; struct blk_plug plug; @@ -100850,372 +98510,8 @@ index b5ffbaf61..e08639a7c 100644 if (!madvise_behavior_valid(behavior)) return -EINVAL; -diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 4b27e245a..f2a7fe718 100644 ---- a/mm/memcontrol.c -+++ b/mm/memcontrol.c -@@ -2892,13 +2892,6 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) - } - - #ifdef CONFIG_MEMCG_KMEM --/* -- * The allocated objcg pointers array is not accounted directly. -- * Moreover, it should not come from DMA buffer and is not readily -- * reclaimable. So those GFP bits should be masked off. -- */ --#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) -- - /* - * mod_objcg_mlstate() may be called with irq enabled, so - * mod_memcg_lruvec_state() should be used. -@@ -2917,62 +2910,27 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, - rcu_read_unlock(); - } - --int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, -- gfp_t gfp, bool new_slab) --{ -- unsigned int objects = objs_per_slab(s, slab); -- unsigned long memcg_data; -- void *vec; -- -- gfp &= ~OBJCGS_CLEAR_MASK; -- vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, -- slab_nid(slab)); -- if (!vec) -- return -ENOMEM; -- -- memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; -- if (new_slab) { -- /* -- * If the slab is brand new and nobody can yet access its -- * memcg_data, no synchronization is required and memcg_data can -- * be simply assigned. -- */ -- slab->memcg_data = memcg_data; -- } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { -- /* -- * If the slab is already in use, somebody can allocate and -- * assign obj_cgroups in parallel. In this case the existing -- * objcg vector should be reused. -- */ -- kfree(vec); -- return 0; -- } -- -- kmemleak_not_leak(vec); -- return 0; --} -- - static __always_inline - struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) - { - /* - * Slab objects are accounted individually, not per-page. - * Memcg membership data for each individual object is saved in -- * slab->memcg_data. -+ * slab->obj_exts. - */ - if (folio_test_slab(folio)) { -- struct obj_cgroup **objcgs; -+ struct slabobj_ext *obj_exts; - struct slab *slab; - unsigned int off; - - slab = folio_slab(folio); -- objcgs = slab_objcgs(slab); -- if (!objcgs) -+ obj_exts = slab_obj_exts(slab); -+ if (!obj_exts) - return NULL; - - off = obj_to_index(slab->slab_cache, slab, p); -- if (objcgs[off]) -- return obj_cgroup_memcg(objcgs[off]); -+ if (obj_exts[off].objcg) -+ return obj_cgroup_memcg(obj_exts[off].objcg); - - return NULL; - } -@@ -2980,7 +2938,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) - /* - * folio_memcg_check() is used here, because in theory we can encounter - * a folio where the slab flag has been cleared already, but -- * slab->memcg_data has not been freed yet -+ * slab->obj_exts has not been freed yet - * folio_memcg_check() will guarantee that a proper memory - * cgroup pointer or NULL will be returned. 
- */ -diff --git a/mm/mempolicy.c b/mm/mempolicy.c -index 1756389a0..aaf767767 100644 ---- a/mm/mempolicy.c -+++ b/mm/mempolicy.c -@@ -2109,7 +2109,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, - { - struct page *page; - -- page = __alloc_pages(gfp, order, nid, NULL); -+ page = __alloc_pages_noprof(gfp, order, nid, NULL); - /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ - if (!static_branch_likely(&vm_numa_stat_key)) - return page; -@@ -2135,15 +2135,15 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, - */ - preferred_gfp = gfp | __GFP_NOWARN; - preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); -- page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); -+ page = __alloc_pages_noprof(preferred_gfp, order, nid, &pol->nodes); - if (!page) -- page = __alloc_pages(gfp, order, nid, NULL); -+ page = __alloc_pages_noprof(gfp, order, nid, NULL); - - return page; - } - - /** -- * vma_alloc_folio - Allocate a folio for a VMA. -+ * vma_alloc_folio_noprof - Allocate a folio for a VMA. - * @gfp: GFP flags. - * @order: Order of the folio. - * @vma: Pointer to VMA or NULL if not available. -@@ -2157,7 +2157,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, - * - * Return: The folio on success or NULL if allocation fails. - */ --struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, -+struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage) - { - struct mempolicy *pol; -@@ -2228,7 +2228,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - * memory with both reclaim and compact as well. - */ - if (!folio && (gfp & __GFP_DIRECT_RECLAIM)) -- folio = __folio_alloc(gfp, order, hpage_node, -+ folio = __folio_alloc_noprof(gfp, order, hpage_node, - nmask); - - goto out; -@@ -2237,15 +2237,15 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - - nmask = policy_nodemask(gfp, pol); - preferred_nid = policy_node(gfp, pol, node); -- folio = __folio_alloc(gfp, order, preferred_nid, nmask); -+ folio = __folio_alloc_noprof(gfp, order, preferred_nid, nmask); - mpol_cond_put(pol); - out: - return folio; - } --EXPORT_SYMBOL(vma_alloc_folio); -+EXPORT_SYMBOL(vma_alloc_folio_noprof); - - /** -- * alloc_pages - Allocate pages. -+ * alloc_pages_noprof - Allocate pages. - * @gfp: GFP flags. - * @order: Power of two of number of pages to allocate. - * -@@ -2258,7 +2258,7 @@ EXPORT_SYMBOL(vma_alloc_folio); - * flags are used. - * Return: The page on success or NULL if allocation fails. 
- */ --struct page *alloc_pages(gfp_t gfp, unsigned order) -+struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) - { - struct mempolicy *pol = &default_policy; - struct page *page; -@@ -2276,23 +2276,23 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) - page = alloc_pages_preferred_many(gfp, order, - policy_node(gfp, pol, numa_node_id()), pol); - else -- page = __alloc_pages(gfp, order, -+ page = __alloc_pages_noprof(gfp, order, - policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol)); - - return page; - } --EXPORT_SYMBOL(alloc_pages); -+EXPORT_SYMBOL(alloc_pages_noprof); - --struct folio *folio_alloc(gfp_t gfp, unsigned order) -+struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) - { -- struct page *page = alloc_pages(gfp | __GFP_COMP, order); -+ struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); - - if (page && order > 1) - prep_transhuge_page(page); - return (struct folio *)page; - } --EXPORT_SYMBOL(folio_alloc); -+EXPORT_SYMBOL(folio_alloc_noprof); - - static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, - struct mempolicy *pol, unsigned long nr_pages, -@@ -2311,13 +2311,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, - - for (i = 0; i < nodes; i++) { - if (delta) { -- nr_allocated = __alloc_pages_bulk(gfp, -+ nr_allocated = alloc_pages_bulk_noprof(gfp, - interleave_nodes(pol), NULL, - nr_pages_per_node + 1, NULL, - page_array); - delta--; - } else { -- nr_allocated = __alloc_pages_bulk(gfp, -+ nr_allocated = alloc_pages_bulk_noprof(gfp, - interleave_nodes(pol), NULL, - nr_pages_per_node, NULL, page_array); - } -@@ -2339,11 +2339,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, - preferred_gfp = gfp | __GFP_NOWARN; - preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - -- nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, -+ nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, - nr_pages, NULL, page_array); - - if (nr_allocated < nr_pages) -- nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, -+ nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, - nr_pages - nr_allocated, NULL, - page_array + nr_allocated); - return nr_allocated; -@@ -2355,7 +2355,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, - * It can accelerate memory allocation especially interleaving - * allocate memory. - */ --unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, -+unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, - unsigned long nr_pages, struct page **page_array) - { - struct mempolicy *pol = &default_policy; -@@ -2371,7 +2371,7 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, - return alloc_pages_bulk_array_preferred_many(gfp, - numa_node_id(), pol, nr_pages, page_array); - -- return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), -+ return alloc_pages_bulk_noprof(gfp, policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol), nr_pages, NULL, - page_array); - } -diff --git a/mm/mempool.c b/mm/mempool.c -index 734bcf5af..4fd949178 100644 ---- a/mm/mempool.c -+++ b/mm/mempool.c -@@ -230,17 +230,17 @@ EXPORT_SYMBOL(mempool_init_node); - * - * Return: %0 on success, negative error code otherwise. 
- */ --int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, -- mempool_free_t *free_fn, void *pool_data) -+int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, -+ mempool_free_t *free_fn, void *pool_data) - { - return mempool_init_node(pool, min_nr, alloc_fn, free_fn, - pool_data, GFP_KERNEL, NUMA_NO_NODE); - - } --EXPORT_SYMBOL(mempool_init); -+EXPORT_SYMBOL(mempool_init_noprof); - - /** -- * mempool_create - create a memory pool -+ * mempool_create_node - create a memory pool - * @min_nr: the minimum number of elements guaranteed to be - * allocated for this pool. - * @alloc_fn: user-defined element-allocation function. -@@ -255,17 +255,9 @@ EXPORT_SYMBOL(mempool_init); - * - * Return: pointer to the created memory pool object or %NULL on error. - */ --mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, -- mempool_free_t *free_fn, void *pool_data) --{ -- return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data, -- GFP_KERNEL, NUMA_NO_NODE); --} --EXPORT_SYMBOL(mempool_create); -- --mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, -- mempool_free_t *free_fn, void *pool_data, -- gfp_t gfp_mask, int node_id) -+mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, -+ mempool_free_t *free_fn, void *pool_data, -+ gfp_t gfp_mask, int node_id) - { - mempool_t *pool; - -@@ -281,7 +273,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, - - return pool; - } --EXPORT_SYMBOL(mempool_create_node); -+EXPORT_SYMBOL(mempool_create_node_noprof); - - /** - * mempool_resize - resize an existing memory pool -@@ -377,7 +369,7 @@ EXPORT_SYMBOL(mempool_resize); - * - * Return: pointer to the allocated element or %NULL on error. - */ --void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) -+void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) - { - void *element; - unsigned long flags; -@@ -444,7 +436,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) - finish_wait(&pool->wait, &wait); - goto repeat_alloc; - } --EXPORT_SYMBOL(mempool_alloc); -+EXPORT_SYMBOL(mempool_alloc_noprof); - - /** - * mempool_free - return an element to the pool. 
-@@ -515,7 +507,7 @@ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) - { - struct kmem_cache *mem = pool_data; - VM_BUG_ON(mem->ctor); -- return kmem_cache_alloc(mem, gfp_mask); -+ return kmem_cache_alloc_noprof(mem, gfp_mask); - } - EXPORT_SYMBOL(mempool_alloc_slab); - -@@ -533,7 +525,7 @@ EXPORT_SYMBOL(mempool_free_slab); - void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) - { - size_t size = (size_t)pool_data; -- return kmalloc(size, gfp_mask); -+ return kmalloc_noprof(size, gfp_mask); - } - EXPORT_SYMBOL(mempool_kmalloc); - -@@ -550,7 +542,7 @@ EXPORT_SYMBOL(mempool_kfree); - void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) - { - int order = (int)(long)pool_data; -- return alloc_pages(gfp_mask, order); -+ return alloc_pages_noprof(gfp_mask, order); - } - EXPORT_SYMBOL(mempool_alloc_pages); - -diff --git a/mm/mm_init.c b/mm/mm_init.c -index 7f7f9c677..42135fad4 100644 ---- a/mm/mm_init.c -+++ b/mm/mm_init.c -@@ -24,6 +24,7 @@ - #include - #include - #include -+#include - #include - #include - #include "internal.h" diff --git a/mm/oom_kill.c b/mm/oom_kill.c -index 044e1eed7..f2657245e 100644 +index 612b5597d..467cff51f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -168,27 +168,6 @@ static bool oom_unkillable_task(struct task_struct *p) @@ -101255,582 +98551,24 @@ index 044e1eed7..f2657245e 100644 } if (sysctl_oom_dump_tasks) dump_tasks(oc); -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 47421bedc..e20ef7a00 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -74,6 +74,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -1259,6 +1260,7 @@ static __always_inline bool free_pages_prepare(struct page *page, - __memcg_kmem_uncharge_page(page, order); - reset_page_owner(page, order); - page_table_check_free(page, order); -+ pgalloc_tag_sub(page, order); - return false; - } - -@@ -1301,6 +1303,7 @@ static __always_inline bool free_pages_prepare(struct page *page, - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - reset_page_owner(page, order); - page_table_check_free(page, order); -+ pgalloc_tag_sub(page, order); - - if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page), -@@ -1730,6 +1733,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, - - set_page_owner(page, order, gfp_flags); - page_table_check_alloc(page, order); -+ pgalloc_tag_add(page, current, order); - } - - static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, -@@ -2790,6 +2794,7 @@ void split_page(struct page *page, unsigned int order) - for (i = 1; i < (1 << order); i++) - set_page_refcounted(page + i); - split_page_owner(page, 1 << order); -+ pgalloc_tag_split(page, 1 << order); - split_page_memcg(page, 1 << order); - } - EXPORT_SYMBOL_GPL(split_page); -@@ -4577,7 +4582,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, - * - * Returns the number of pages on the list or array. 
- */ --unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, -+unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, - nodemask_t *nodemask, int nr_pages, - struct list_head *page_list, - struct page **page_array) -@@ -4713,7 +4718,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, - pcp_trylock_finish(UP_flags); - - failed: -- page = __alloc_pages(gfp, 0, preferred_nid, nodemask); -+ page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); - if (page) { - if (page_list) - list_add(&page->lru, page_list); -@@ -4724,13 +4729,13 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, - - goto out; - } --EXPORT_SYMBOL_GPL(__alloc_pages_bulk); -+EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); - - /* - * This is the 'heart' of the zoned buddy allocator. - */ --struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, -- nodemask_t *nodemask) -+struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, -+ int preferred_nid, nodemask_t *nodemask) - { - struct page *page; - unsigned int alloc_flags = ALLOC_WMARK_LOW; -@@ -4792,41 +4797,41 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, - - return page; - } --EXPORT_SYMBOL(__alloc_pages); -+EXPORT_SYMBOL(__alloc_pages_noprof); - --struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, -+struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, - nodemask_t *nodemask) - { -- struct page *page = __alloc_pages(gfp | __GFP_COMP, order, -+ struct page *page = __alloc_pages_noprof(gfp | __GFP_COMP, order, - preferred_nid, nodemask); - - if (page && order > 1) - prep_transhuge_page(page); - return (struct folio *)page; - } --EXPORT_SYMBOL(__folio_alloc); -+EXPORT_SYMBOL(__folio_alloc_noprof); - - /* - * Common helper functions. Never use with __GFP_HIGHMEM because the returned - * address cannot represent highmem pages. Use alloc_pages and then kmap if - * you need to access high mem. - */ --unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) -+unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order) - { - struct page *page; - -- page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); -+ page = alloc_pages_noprof(gfp_mask & ~__GFP_HIGHMEM, order); - if (!page) - return 0; - return (unsigned long) page_address(page); - } --EXPORT_SYMBOL(__get_free_pages); -+EXPORT_SYMBOL(get_free_pages_noprof); - --unsigned long get_zeroed_page(gfp_t gfp_mask) -+unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) - { -- return __get_free_page(gfp_mask | __GFP_ZERO); -+ return get_free_pages_noprof(gfp_mask | __GFP_ZERO, 0); - } --EXPORT_SYMBOL(get_zeroed_page); -+EXPORT_SYMBOL(get_zeroed_page_noprof); - - /** - * __free_pages - Free pages allocated with alloc_pages(). -@@ -5006,6 +5011,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, - struct page *last = page + nr; - - split_page_owner(page, 1 << order); -+ pgalloc_tag_split(page, 1 << order); - split_page_memcg(page, 1 << order); - while (page < --last) - set_page_refcounted(last); -@@ -5018,7 +5024,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, - } - - /** -- * alloc_pages_exact - allocate an exact number physically-contiguous pages. -+ * alloc_pages_exact_noprof - allocate an exact number physically-contiguous pages. 
- * @size: the number of bytes to allocate - * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP - * -@@ -5032,7 +5038,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, - * - * Return: pointer to the allocated area or %NULL in case of error. - */ --void *alloc_pages_exact(size_t size, gfp_t gfp_mask) -+void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) - { - unsigned int order = get_order(size); - unsigned long addr; -@@ -5040,13 +5046,13 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) - if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) - gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); - -- addr = __get_free_pages(gfp_mask, order); -+ addr = get_free_pages_noprof(gfp_mask, order); - return make_alloc_exact(addr, order, size); - } --EXPORT_SYMBOL(alloc_pages_exact); -+EXPORT_SYMBOL(alloc_pages_exact_noprof); - - /** -- * alloc_pages_exact_nid - allocate an exact number of physically-contiguous -+ * alloc_pages_exact_nid_noprof - allocate an exact number of physically-contiguous - * pages on a node. - * @nid: the preferred node ID where memory should be allocated - * @size: the number of bytes to allocate -@@ -5057,7 +5063,7 @@ EXPORT_SYMBOL(alloc_pages_exact); - * - * Return: pointer to the allocated area or %NULL in case of error. - */ --void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) -+void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) - { - unsigned int order = get_order(size); - struct page *p; -@@ -5065,7 +5071,7 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) - if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) - gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); - -- p = alloc_pages_node(nid, gfp_mask, order); -+ p = alloc_pages_node_noprof(nid, gfp_mask, order); - if (!p) - return NULL; - return make_alloc_exact((unsigned long)page_address(p), order, size); -@@ -6738,7 +6744,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, - } - - /** -- * alloc_contig_range() -- tries to allocate given range of pages -+ * alloc_contig_range_noprof() -- tries to allocate given range of pages - * @start: start PFN to allocate - * @end: one-past-the-last PFN to allocate - * @migratetype: migratetype of the underlying pageblocks (either -@@ -6758,7 +6764,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, - * pages which PFN is in [start, end) are allocated for the caller and - * need to be freed with free_contig_range(). 
- */ --int alloc_contig_range(unsigned long start, unsigned long end, -+int alloc_contig_range_noprof(unsigned long start, unsigned long end, - unsigned migratetype, gfp_t gfp_mask) - { - unsigned long outer_start, outer_end; -@@ -6882,15 +6888,15 @@ int alloc_contig_range(unsigned long start, unsigned long end, - undo_isolate_page_range(start, end, migratetype); - return ret; - } --EXPORT_SYMBOL(alloc_contig_range); -+EXPORT_SYMBOL(alloc_contig_range_noprof); - - static int __alloc_contig_pages(unsigned long start_pfn, - unsigned long nr_pages, gfp_t gfp_mask) - { - unsigned long end_pfn = start_pfn + nr_pages; - -- return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, -- gfp_mask); -+ return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE, -+ gfp_mask); - } - - static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, -@@ -6925,7 +6931,7 @@ static bool zone_spans_last_pfn(const struct zone *zone, - } - - /** -- * alloc_contig_pages() -- tries to find and allocate contiguous range of pages -+ * alloc_contig_pages_noprof() -- tries to find and allocate contiguous range of pages - * @nr_pages: Number of contiguous pages to allocate - * @gfp_mask: GFP mask to limit search and used during compaction - * @nid: Target node -@@ -6945,8 +6951,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, - * - * Return: pointer to contiguous pages on success, or NULL if not successful. - */ --struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, -- int nid, nodemask_t *nodemask) -+struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, -+ int nid, nodemask_t *nodemask) - { - unsigned long ret, pfn, flags; - struct zonelist *zonelist; -diff --git a/mm/page_ext.c b/mm/page_ext.c -index dc1626be4..6c8ad6e12 100644 ---- a/mm/page_ext.c -+++ b/mm/page_ext.c -@@ -10,6 +10,7 @@ - #include - #include - #include -+#include - - /* - * struct page extension -@@ -82,6 +83,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { - #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) - &page_idle_ops, - #endif -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ &page_alloc_tagging_ops, -+#endif - #ifdef CONFIG_PAGE_TABLE_CHECK - &page_table_check_ops, - #endif -@@ -92,7 +96,16 @@ unsigned long page_ext_size; - static unsigned long total_usage; - static struct page_ext *lookup_page_ext(const struct page *page); - -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+/* -+ * To ensure correct allocation tagging for pages, page_ext should be available -+ * before the first page allocation. Otherwise early task stacks will be -+ * allocated before page_ext initialization and missing tags will be flagged. 
-+ */ -+bool early_page_ext __meminitdata = true; -+#else - bool early_page_ext __meminitdata; -+#endif - static int __init setup_early_page_ext(char *str) - { - early_page_ext = true; -diff --git a/mm/page_owner.c b/mm/page_owner.c -index 31169b3e7..8b6086c66 100644 ---- a/mm/page_owner.c -+++ b/mm/page_owner.c -@@ -372,7 +372,7 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, - if (!memcg_data) - goto out_unlock; - -- if (memcg_data & MEMCG_DATA_OBJCGS) -+ if (memcg_data & MEMCG_DATA_OBJEXTS) - ret += scnprintf(kbuf + ret, count - ret, - "Slab cache page\n"); - -diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h -index f9847c131..c5d1d6723 100644 ---- a/mm/percpu-internal.h -+++ b/mm/percpu-internal.h -@@ -32,6 +32,19 @@ struct pcpu_block_md { - int nr_bits; /* total bits responsible for */ - }; - -+struct pcpuobj_ext { -+#ifdef CONFIG_MEMCG_KMEM -+ struct obj_cgroup *cgroup; -+#endif -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ union codetag_ref tag; -+#endif -+}; -+ -+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MEM_ALLOC_PROFILING) -+#define NEED_PCPUOBJ_EXT -+#endif -+ - struct pcpu_chunk { - #ifdef CONFIG_PERCPU_STATS - int nr_alloc; /* # of allocations */ -@@ -57,8 +70,8 @@ struct pcpu_chunk { - int end_offset; /* additional area required to - have the region end page - aligned */ --#ifdef CONFIG_MEMCG_KMEM -- struct obj_cgroup **obj_cgroups; /* vector of object cgroups */ -+#ifdef NEED_PCPUOBJ_EXT -+ struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ - #endif - - int nr_pages; /* # of pages served by this chunk */ -@@ -67,6 +80,15 @@ struct pcpu_chunk { - unsigned long populated[]; /* populated bitmap */ - }; - -+static inline bool need_pcpuobj_ext(void) -+{ -+ if (IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING)) -+ return true; -+ if (!mem_cgroup_kmem_disabled()) -+ return true; -+ return false; -+} -+ - extern spinlock_t pcpu_lock; - - extern struct list_head *pcpu_chunk_lists; -diff --git a/mm/percpu.c b/mm/percpu.c -index 28e07ede4..2298f38d4 100644 ---- a/mm/percpu.c -+++ b/mm/percpu.c -@@ -1392,9 +1392,9 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); - --#ifdef CONFIG_MEMCG_KMEM -+#ifdef NEED_PCPUOBJ_EXT - /* first chunk is free to use */ -- chunk->obj_cgroups = NULL; -+ chunk->obj_exts = NULL; - #endif - pcpu_init_md_blocks(chunk); - -@@ -1463,12 +1463,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) - if (!chunk->md_blocks) - goto md_blocks_fail; - --#ifdef CONFIG_MEMCG_KMEM -- if (!mem_cgroup_kmem_disabled()) { -- chunk->obj_cgroups = -+#ifdef NEED_PCPUOBJ_EXT -+ if (need_pcpuobj_ext()) { -+ chunk->obj_exts = - pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) * -- sizeof(struct obj_cgroup *), gfp); -- if (!chunk->obj_cgroups) -+ sizeof(struct pcpuobj_ext), gfp); -+ if (!chunk->obj_exts) - goto objcg_fail; - } - #endif -@@ -1480,7 +1480,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) - - return chunk; - --#ifdef CONFIG_MEMCG_KMEM -+#ifdef NEED_PCPUOBJ_EXT - objcg_fail: - pcpu_mem_free(chunk->md_blocks); - #endif -@@ -1498,8 +1498,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) - { - if (!chunk) - return; --#ifdef CONFIG_MEMCG_KMEM -- pcpu_mem_free(chunk->obj_cgroups); -+#ifdef NEED_PCPUOBJ_EXT -+ pcpu_mem_free(chunk->obj_exts); - #endif - pcpu_mem_free(chunk->md_blocks); - pcpu_mem_free(chunk->bound_map); -@@ -1648,8 +1648,8 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, - if (!objcg) 
- return; - -- if (likely(chunk && chunk->obj_cgroups)) { -- chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; -+ if (likely(chunk && chunk->obj_exts)) { -+ chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg; - - rcu_read_lock(); - mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, -@@ -1665,13 +1665,13 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) - { - struct obj_cgroup *objcg; - -- if (unlikely(!chunk->obj_cgroups)) -+ if (unlikely(!chunk->obj_exts)) - return; - -- objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT]; -+ objcg = chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup; - if (!objcg) - return; -- chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; -+ chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = NULL; - - obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); - -@@ -1701,8 +1701,34 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) - } - #endif /* CONFIG_MEMCG_KMEM */ - -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, -+ size_t size) -+{ -+ if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) { -+ alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, -+ current->alloc_tag, size); -+ } -+} -+ -+static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) -+{ -+ if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) -+ alloc_tag_sub_noalloc(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size); -+} -+#else -+static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, -+ size_t size) -+{ -+} -+ -+static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) -+{ -+} -+#endif -+ - /** -- * pcpu_alloc - the percpu allocator -+ * pcpu_alloc_noprof - the percpu allocator - * @size: size of area to allocate in bytes - * @align: alignment of area (max PAGE_SIZE) - * @reserved: allocate from the reserved chunk if available -@@ -1716,7 +1742,7 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) - * RETURNS: - * Percpu pointer to the allocated area on success, NULL on failure. - */ --static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, -+void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, - gfp_t gfp) - { - gfp_t pcpu_gfp; -@@ -1883,6 +1909,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, - - pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); - -+ pcpu_alloc_tag_alloc_hook(chunk, off, size); -+ - return ptr; - - fail_unlock: -@@ -1909,61 +1937,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, - - return NULL; - } -- --/** -- * __alloc_percpu_gfp - allocate dynamic percpu area -- * @size: size of area to allocate in bytes -- * @align: alignment of area (max PAGE_SIZE) -- * @gfp: allocation flags -- * -- * Allocate zero-filled percpu area of @size bytes aligned at @align. If -- * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can -- * be called from any context but is a lot more likely to fail. If @gfp -- * has __GFP_NOWARN then no warning will be triggered on invalid or failed -- * allocation requests. -- * -- * RETURNS: -- * Percpu pointer to the allocated area on success, NULL on failure. 
-- */ --void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) --{ -- return pcpu_alloc(size, align, false, gfp); --} --EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); -- --/** -- * __alloc_percpu - allocate dynamic percpu area -- * @size: size of area to allocate in bytes -- * @align: alignment of area (max PAGE_SIZE) -- * -- * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). -- */ --void __percpu *__alloc_percpu(size_t size, size_t align) --{ -- return pcpu_alloc(size, align, false, GFP_KERNEL); --} --EXPORT_SYMBOL_GPL(__alloc_percpu); -- --/** -- * __alloc_reserved_percpu - allocate reserved percpu area -- * @size: size of area to allocate in bytes -- * @align: alignment of area (max PAGE_SIZE) -- * -- * Allocate zero-filled percpu area of @size bytes aligned at @align -- * from reserved percpu area if arch has set it up; otherwise, -- * allocation is served from the same dynamic area. Might sleep. -- * Might trigger writeouts. -- * -- * CONTEXT: -- * Does GFP_KERNEL allocation. -- * -- * RETURNS: -- * Percpu pointer to the allocated area on success, NULL on failure. -- */ --void __percpu *__alloc_reserved_percpu(size_t size, size_t align) --{ -- return pcpu_alloc(size, align, true, GFP_KERNEL); --} -+EXPORT_SYMBOL_GPL(pcpu_alloc_noprof); - - /** - * pcpu_balance_free - manage the amount of free chunks -@@ -2273,6 +2247,8 @@ void free_percpu(void __percpu *ptr) - - size = pcpu_free_area(chunk, off); - -+ pcpu_alloc_tag_free_hook(chunk, off, size); -+ - pcpu_memcg_free_hook(chunk, off, size); - - /* -diff --git a/lib/show_mem.c b/mm/show_mem.c -similarity index 57% -rename from lib/show_mem.c -rename to mm/show_mem.c -index 1485c87be..de209c55d 100644 ---- a/lib/show_mem.c +diff --git a/mm/show_mem.c b/mm/show_mem.c +index 01f8e9905..94ebd86c8 100644 +--- a/mm/show_mem.c +++ b/mm/show_mem.c -@@ -7,11 +7,15 @@ - +@@ -12,10 +12,12 @@ + #include #include - #include + #include +#include -+ -+#include "slab.h" + #include + #include - void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) + #include "internal.h" ++#include "slab.h" + #include "swap.h" + + atomic_long_t _totalram_pages __read_mostly; +@@ -404,6 +406,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long total = 0, reserved = 0, highmem = 0; struct zone *zone; @@ -101838,7 +98576,7 @@ index 1485c87be..de209c55d 100644 printk("Mem-Info:\n"); __show_free_areas(filter, nodemask, max_zone_idx); -@@ -34,4 +38,37 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) +@@ -426,4 +429,23 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif @@ -101861,470 +98599,12 @@ index 1485c87be..de209c55d 100644 + + kfree(buf); + } -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ { -+ struct seq_buf s; -+ char *buf = kmalloc(4096, GFP_ATOMIC); -+ -+ if (buf) { -+ printk("Memory allocations:\n"); -+ seq_buf_init(&s, buf, 4096); -+ alloc_tags_show_mem_report(&s); -+ printk("%s", buf); -+ kfree(buf); -+ } -+ } -+#endif } -diff --git a/mm/slab.c b/mm/slab.c -index bb57f7fdb..d02d2dd27 100644 ---- a/mm/slab.c -+++ b/mm/slab.c -@@ -1232,7 +1232,7 @@ void __init kmem_cache_init(void) - create_boot_cache(kmem_cache, "kmem_cache", - offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *), -- SLAB_HWCACHE_ALIGN, 0, 0); -+ SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); - list_add(&kmem_cache->list, 
&slab_caches); - slab_state = PARTIAL; - -@@ -3367,9 +3367,11 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) - static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, - unsigned long caller) - { -+ struct slab *slab = virt_to_slab(objp); - bool init; - -- memcg_slab_free_hook(cachep, virt_to_slab(objp), &objp, 1); -+ memcg_slab_free_hook(cachep, slab, &objp, 1); -+ alloc_tagging_slab_free_hook(cachep, slab, &objp, 1); - - if (is_kfence_address(objp)) { - kmemleak_free_recursive(objp, cachep->flags); -@@ -3446,18 +3448,18 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, - return ret; - } - --void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) -+void *kmem_cache_alloc_noprof(struct kmem_cache *cachep, gfp_t flags) - { - return __kmem_cache_alloc_lru(cachep, NULL, flags); - } --EXPORT_SYMBOL(kmem_cache_alloc); -+EXPORT_SYMBOL(kmem_cache_alloc_noprof); - --void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, -+void *kmem_cache_alloc_lru_noprof(struct kmem_cache *cachep, struct list_lru *lru, - gfp_t flags) - { - return __kmem_cache_alloc_lru(cachep, lru, flags); - } --EXPORT_SYMBOL(kmem_cache_alloc_lru); -+EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); - - static __always_inline void - cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, -@@ -3469,8 +3471,8 @@ cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, - p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); - } - --int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, -- void **p) -+int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, -+ void **p) - { - struct obj_cgroup *objcg = NULL; - unsigned long irqflags; -@@ -3508,7 +3510,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - kmem_cache_free_bulk(s, i, p); - return 0; - } --EXPORT_SYMBOL(kmem_cache_alloc_bulk); -+EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); - - /** - * kmem_cache_alloc_node - Allocate an object on the specified node -@@ -3523,7 +3525,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); - * - * Return: pointer to the new object or %NULL in case of error - */ --void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) -+void *kmem_cache_alloc_node_noprof(struct kmem_cache *cachep, gfp_t flags, int nodeid) - { - void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); - -@@ -3531,7 +3533,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) - - return ret; - } --EXPORT_SYMBOL(kmem_cache_alloc_node); -+EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); - - void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, - int nodeid, size_t orig_size, diff --git a/mm/slab.h b/mm/slab.h -index f01ac256a..bc2d3429d 100644 +index 9c0e09d0f..7bcf32b47 100644 --- a/mm/slab.h +++ b/mm/slab.h -@@ -57,8 +57,8 @@ struct slab { - #endif - - atomic_t __page_refcount; --#ifdef CONFIG_MEMCG -- unsigned long memcg_data; -+#ifdef CONFIG_SLAB_OBJ_EXT -+ unsigned long obj_exts; - #endif - }; - -@@ -67,8 +67,8 @@ struct slab { - SLAB_MATCH(flags, __page_flags); - SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ - SLAB_MATCH(_refcount, __page_refcount); --#ifdef CONFIG_MEMCG --SLAB_MATCH(memcg_data, memcg_data); -+#ifdef CONFIG_SLAB_OBJ_EXT -+SLAB_MATCH(memcg_data, obj_exts); - #endif - #undef SLAB_MATCH - static_assert(sizeof(struct slab) <= sizeof(struct 
page)); -@@ -390,36 +390,198 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla - return false; - } - --#ifdef CONFIG_MEMCG_KMEM -+#ifdef CONFIG_SLAB_OBJ_EXT -+ - /* -- * slab_objcgs - get the object cgroups vector associated with a slab -+ * slab_obj_exts - get the pointer to the slab object extension vector -+ * associated with a slab. - * @slab: a pointer to the slab struct - * -- * Returns a pointer to the object cgroups vector associated with the slab, -+ * Returns a pointer to the object extension vector associated with the slab, - * or NULL if no such vector has been associated yet. - */ --static inline struct obj_cgroup **slab_objcgs(struct slab *slab) -+static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) - { -- unsigned long memcg_data = READ_ONCE(slab->memcg_data); -+ unsigned long obj_exts = READ_ONCE(slab->obj_exts); - -- VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), -+#ifdef CONFIG_MEMCG -+ VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS), - slab_page(slab)); -- VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab)); -+ VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab)); - -- return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); -+#endif -+ return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK); - } - --int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, -- gfp_t gfp, bool new_slab); --void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, -- enum node_stat_item idx, int nr); -+int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, -+ gfp_t gfp, bool new_slab); -+ -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG -+ -+static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) -+{ -+ struct slabobj_ext *slab_exts; -+ struct slab *obj_exts_slab; -+ -+ obj_exts_slab = virt_to_slab(obj_exts); -+ slab_exts = slab_obj_exts(obj_exts_slab); -+ if (slab_exts) { -+ unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, -+ obj_exts_slab, obj_exts); -+ /* codetag should be NULL */ -+ WARN_ON(slab_exts[offs].ref.ct); -+ set_codetag_empty(&slab_exts[offs].ref); -+ } -+} -+ -+static inline void mark_failed_objexts_alloc(struct slab *slab) -+{ -+ slab->obj_exts = OBJEXTS_ALLOC_FAIL; -+} -+ -+static inline void handle_failed_objexts_alloc(unsigned long obj_exts, -+ struct slabobj_ext *vec, unsigned int objects) -+{ -+ /* -+ * If vector previously failed to allocate then we have live -+ * objects with no tag reference. Mark all references in this -+ * vector as empty to avoid warnings later on. -+ */ -+ if (obj_exts & OBJEXTS_ALLOC_FAIL) { -+ unsigned int i; -+ -+ for (i = 0; i < objects; i++) -+ set_codetag_empty(&vec[i].ref); -+ } -+} -+ -+ -+#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -+ -+static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {} -+static inline void mark_failed_objexts_alloc(struct slab *slab) {} -+static inline void handle_failed_objexts_alloc(unsigned long obj_exts, -+ struct slabobj_ext *vec, unsigned int objects) {} -+ -+#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ -+ -+static inline bool need_slab_obj_ext(void) -+{ -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ if (mem_alloc_profiling_enabled()) -+ return true; -+#endif -+ /* -+ * CONFIG_MEMCG_KMEM creates vector of obj_cgroup objects conditionally -+ * inside memcg_slab_post_alloc_hook. No other users for now. 
-+ */ -+ return false; -+} -+ -+static inline void free_slab_obj_exts(struct slab *slab) -+{ -+ struct slabobj_ext *obj_exts; -+ -+ obj_exts = slab_obj_exts(slab); -+ if (!obj_exts) -+ return; -+ -+ /* -+ * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its -+ * corresponding extension will be NULL. alloc_tag_sub() will throw a -+ * warning if slab has extensions but the extension of an object is -+ * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that -+ * the extension for obj_exts is expected to be NULL. -+ */ -+ mark_objexts_empty(obj_exts); -+ kfree(obj_exts); -+ slab->obj_exts = 0; -+} -+ -+static inline struct slabobj_ext * -+prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) -+{ -+ struct slab *slab; -+ -+ if (!p) -+ return NULL; -+ -+ if (!need_slab_obj_ext()) -+ return NULL; -+ -+ if (s->flags & SLAB_NO_OBJ_EXT) -+ return NULL; - --static inline void memcg_free_slab_cgroups(struct slab *slab) -+ if (flags & __GFP_NO_OBJ_EXT) -+ return NULL; -+ -+ slab = virt_to_slab(p); -+ if (!slab_obj_exts(slab) && -+ WARN(alloc_slab_obj_exts(slab, s, flags, false), -+ "%s, %s: Failed to create slab extension vector!\n", -+ __func__, s->name)) -+ return NULL; -+ -+ return slab_obj_exts(slab) + obj_to_index(s, slab, p); -+} -+ -+#else /* CONFIG_SLAB_OBJ_EXT */ -+ -+static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) - { -- kfree(slab_objcgs(slab)); -- slab->memcg_data = 0; -+ return NULL; -+} -+ -+static inline int alloc_slab_obj_exts(struct slab *slab, -+ struct kmem_cache *s, gfp_t gfp, -+ bool new_slab) -+{ -+ return 0; -+} -+ -+static inline void free_slab_obj_exts(struct slab *slab) -+{ -+} -+ -+static inline struct slabobj_ext * -+prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) -+{ -+ return NULL; -+} -+ -+#endif /* CONFIG_SLAB_OBJ_EXT */ -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ -+static inline void alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, -+ void **p, int objects) -+{ -+ struct slabobj_ext *obj_exts; -+ int i; -+ -+ obj_exts = slab_obj_exts(slab); -+ if (!obj_exts) -+ return; -+ -+ for (i = 0; i < objects; i++) { -+ unsigned int off = obj_to_index(s, slab, p[i]); -+ -+ alloc_tag_sub(&obj_exts[off].ref, s->size); -+ } - } - -+#else -+ -+static inline void alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, -+ void **p, int objects) {} -+ -+#endif /* CONFIG_MEM_ALLOC_PROFILING */ -+ -+#ifdef CONFIG_MEMCG_KMEM -+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, -+ enum node_stat_item idx, int nr); -+ - static inline size_t obj_full_size(struct kmem_cache *s) - { - /* -@@ -487,16 +649,15 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, - if (likely(p[i])) { - slab = virt_to_slab(p[i]); - -- if (!slab_objcgs(slab) && -- memcg_alloc_slab_cgroups(slab, s, flags, -- false)) { -+ if (!slab_obj_exts(slab) && -+ alloc_slab_obj_exts(slab, s, flags, false)) { - obj_cgroup_uncharge(objcg, obj_full_size(s)); - continue; - } - - off = obj_to_index(s, slab, p[i]); - obj_cgroup_get(objcg); -- slab_objcgs(slab)[off] = objcg; -+ slab_obj_exts(slab)[off].objcg = objcg; - mod_objcg_state(objcg, slab_pgdat(slab), - cache_vmstat_idx(s), obj_full_size(s)); - } else { -@@ -509,14 +670,14 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, - static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects) - { -- struct obj_cgroup **objcgs; -+ struct slabobj_ext *obj_exts; - int i; - 
- if (!memcg_kmem_online()) - return; - -- objcgs = slab_objcgs(slab); -- if (!objcgs) -+ obj_exts = slab_obj_exts(slab); -+ if (!obj_exts) - return; - - for (i = 0; i < objects; i++) { -@@ -524,11 +685,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - unsigned int off; - - off = obj_to_index(s, slab, p[i]); -- objcg = objcgs[off]; -+ objcg = obj_exts[off].objcg; - if (!objcg) - continue; - -- objcgs[off] = NULL; -+ obj_exts[off].objcg = NULL; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), - -obj_full_size(s)); -@@ -537,27 +698,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - } - - #else /* CONFIG_MEMCG_KMEM */ --static inline struct obj_cgroup **slab_objcgs(struct slab *slab) --{ -- return NULL; --} -- - static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) - { - return NULL; - } - --static inline int memcg_alloc_slab_cgroups(struct slab *slab, -- struct kmem_cache *s, gfp_t gfp, -- bool new_slab) --{ -- return 0; --} -- --static inline void memcg_free_slab_cgroups(struct slab *slab) --{ --} -- - static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, -@@ -594,7 +739,7 @@ static __always_inline void account_slab(struct slab *slab, int order, - struct kmem_cache *s, gfp_t gfp) - { - if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) -- memcg_alloc_slab_cgroups(slab, s, gfp, true); -+ alloc_slab_obj_exts(slab, s, gfp, true); - - mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), - PAGE_SIZE << order); -@@ -603,8 +748,7 @@ static __always_inline void account_slab(struct slab *slab, int order, - static __always_inline void unaccount_slab(struct slab *slab, int order, - struct kmem_cache *s) - { -- if (memcg_kmem_online()) -- memcg_free_slab_cgroups(slab); -+ free_slab_obj_exts(slab); - - mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), - -(PAGE_SIZE << order)); -@@ -684,6 +828,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, - unsigned int orig_size) - { - unsigned int zero_size = s->object_size; -+ struct slabobj_ext *obj_exts; - size_t i; - - flags &= gfp_allowed_mask; -@@ -714,6 +859,13 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, - kmemleak_alloc_recursive(p[i], s->object_size, 1, - s->flags, flags); - kmsan_slab_alloc(s, p[i], flags); -+ obj_exts = prepare_slab_obj_exts_hook(s, flags, p[i]); -+ -+#ifdef CONFIG_MEM_ALLOC_PROFILING -+ /* obj_exts can be allocated for other reasons */ -+ if (likely(obj_exts) && mem_alloc_profiling_enabled()) -+ alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); -+#endif - } - - memcg_slab_post_alloc_hook(s, objcg, flags, size, p); -@@ -766,10 +918,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +@@ -817,10 +817,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) if ((__n = get_node(__s, __node))) @@ -102340,10 +98620,10 @@ index f01ac256a..bc2d3429d 100644 } #endif diff --git a/mm/slab_common.c b/mm/slab_common.c -index 607249785..5b204e16f 100644 +index d1555ea29..fbd6b879d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c -@@ -24,6 +24,7 @@ +@@ -26,6 +26,7 @@ #include #include #include @@ -102351,173 +98631,7 @@ index 607249785..5b204e16f 100644 #include #include "internal.h" -@@ -204,6 +205,64 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, - return NULL; - } - -+#ifdef 
CONFIG_SLAB_OBJ_EXT -+/* -+ * The allocated objcg pointers array is not accounted directly. -+ * Moreover, it should not come from DMA buffer and is not readily -+ * reclaimable. So those GFP bits should be masked off. -+ */ -+#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) -+ -+int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, -+ gfp_t gfp, bool new_slab) -+{ -+ unsigned int objects = objs_per_slab(s, slab); -+ unsigned long new_exts; -+ unsigned long old_exts; -+ struct slabobj_ext *vec; -+ -+ gfp &= ~OBJCGS_CLEAR_MASK; -+ /* Prevent recursive extension vector allocation */ -+ gfp |= __GFP_NO_OBJ_EXT; -+ vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, -+ slab_nid(slab)); -+ if (!vec) { -+ /* Mark vectors which failed to allocate */ -+ if (new_slab) -+ mark_failed_objexts_alloc(slab); -+ -+ return -ENOMEM; -+ } -+ -+ new_exts = (unsigned long)vec; -+#ifdef CONFIG_MEMCG -+ new_exts |= MEMCG_DATA_OBJEXTS; -+#endif -+ old_exts = slab->obj_exts; -+ handle_failed_objexts_alloc(old_exts, vec, objects); -+ if (new_slab) { -+ /* -+ * If the slab is brand new and nobody can yet access its -+ * obj_exts, no synchronization is required and obj_exts can -+ * be simply assigned. -+ */ -+ slab->obj_exts = new_exts; -+ } else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) { -+ /* -+ * If the slab is already in use, somebody can allocate and -+ * assign slabobj_exts in parallel. In this case the existing -+ * objcg vector should be reused. -+ */ -+ mark_objexts_empty(vec); -+ kfree(vec); -+ return 0; -+ } -+ -+ kmemleak_not_leak(vec); -+ return 0; -+} -+#endif /* CONFIG_SLAB_OBJ_EXT */ -+ - static struct kmem_cache *create_cache(const char *name, - unsigned int object_size, unsigned int align, - slab_flags_t flags, unsigned int useroffset, -@@ -968,24 +1027,24 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller - return ret; - } - --void *__kmalloc_node(size_t size, gfp_t flags, int node) -+void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) - { - return __do_kmalloc_node(size, flags, node, _RET_IP_); - } --EXPORT_SYMBOL(__kmalloc_node); -+EXPORT_SYMBOL(__kmalloc_node_noprof); - --void *__kmalloc(size_t size, gfp_t flags) -+void *__kmalloc_noprof(size_t size, gfp_t flags) - { - return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); - } --EXPORT_SYMBOL(__kmalloc); -+EXPORT_SYMBOL(__kmalloc_noprof); - --void *__kmalloc_node_track_caller(size_t size, gfp_t flags, -- int node, unsigned long caller) -+void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, -+ int node, unsigned long caller) - { - return __do_kmalloc_node(size, flags, node, caller); - } --EXPORT_SYMBOL(__kmalloc_node_track_caller); -+EXPORT_SYMBOL(kmalloc_node_track_caller_noprof); - - /** - * kfree - free previously allocated memory -@@ -1052,7 +1111,7 @@ size_t __ksize(const void *object) - return slab_ksize(folio_slab(folio)->slab_cache); - } - --void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) -+void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) - { - void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, - size, _RET_IP_); -@@ -1062,9 +1121,9 @@ void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; - } --EXPORT_SYMBOL(kmalloc_trace); -+EXPORT_SYMBOL(kmalloc_trace_noprof); - --void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, -+void *kmalloc_node_trace_noprof(struct 
kmem_cache *s, gfp_t gfpflags, - int node, size_t size) - { - void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); -@@ -1074,7 +1133,7 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - ret = kasan_kmalloc(s, ret, size, gfpflags); - return ret; - } --EXPORT_SYMBOL(kmalloc_node_trace); -+EXPORT_SYMBOL(kmalloc_node_trace_noprof); - - gfp_t kmalloc_fix_flags(gfp_t flags) - { -@@ -1104,7 +1163,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) - flags = kmalloc_fix_flags(flags); - - flags |= __GFP_COMP; -- page = alloc_pages_node(node, flags, order); -+ page = alloc_pages_node_noprof(node, flags, order); - if (page) { - ptr = page_address(page); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -@@ -1119,7 +1178,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) - return ptr; - } - --void *kmalloc_large(size_t size, gfp_t flags) -+void *kmalloc_large_noprof(size_t size, gfp_t flags) - { - void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); - -@@ -1127,9 +1186,9 @@ void *kmalloc_large(size_t size, gfp_t flags) - flags, NUMA_NO_NODE); - return ret; - } --EXPORT_SYMBOL(kmalloc_large); -+EXPORT_SYMBOL(kmalloc_large_noprof); - --void *kmalloc_large_node(size_t size, gfp_t flags, int node) -+void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) - { - void *ret = __kmalloc_large_node(size, flags, node); - -@@ -1137,7 +1196,7 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) - flags, node); - return ret; - } --EXPORT_SYMBOL(kmalloc_large_node); -+EXPORT_SYMBOL(kmalloc_large_node_noprof); - - #ifdef CONFIG_SLAB_FREELIST_RANDOM - /* Randomize a generic freelist */ -@@ -1259,10 +1318,15 @@ static int slab_show(struct seq_file *m, void *p) +@@ -1273,10 +1274,15 @@ static int slab_show(struct seq_file *m, void *p) return 0; } @@ -102534,7 +98648,7 @@ index 607249785..5b204e16f 100644 /* * Here acquiring slab_mutex is risky since we don't prefer to get -@@ -1272,24 +1336,52 @@ void dump_unreclaimable_slab(void) +@@ -1286,24 +1292,52 @@ void dump_unreclaimable_slab(void) * without acquiring the mutex. */ if (!mutex_trylock(&slab_mutex)) { @@ -102582,7 +98696,7 @@ index 607249785..5b204e16f 100644 + } + + slabs_by_mem[i] = n; - } ++ } + + for (i = nr - 1; i >= 0; --i) { + seq_buf_printf(out, "%-17s total: ", slabs_by_mem[i].s->name); @@ -102590,567 +98704,24 @@ index 607249785..5b204e16f 100644 + seq_buf_printf(out, " active: "); + seq_buf_human_readable_u64(out, slabs_by_mem[i].active); + seq_buf_putc(out, '\n'); -+ } + } + mutex_unlock(&slab_mutex); } -@@ -1356,7 +1448,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) - return (void *)p; - } - -- ret = kmalloc_track_caller(new_size, flags); -+ ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); - if (ret && p) { - /* Disable KASAN checks as the object's redzone is accessed. 
*/ - kasan_disable_current(); -@@ -1380,7 +1472,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) - * - * Return: pointer to the allocated memory or %NULL in case of error - */ --void *krealloc(const void *p, size_t new_size, gfp_t flags) -+void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) - { - void *ret; - -@@ -1395,7 +1487,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) - - return ret; - } --EXPORT_SYMBOL(krealloc); -+EXPORT_SYMBOL(krealloc_noprof); - - /** - * kfree_sensitive - Clear sensitive information in memory before freeing -diff --git a/mm/slub.c b/mm/slub.c -index c87628cd8..768b0e292 100644 ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -1781,7 +1781,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, - return kasan_slab_free(s, x, init); - } - --static inline bool slab_free_freelist_hook(struct kmem_cache *s, -+static __always_inline bool slab_free_freelist_hook(struct kmem_cache *s, - void **head, void **tail, - int *cnt) - { -@@ -3470,18 +3470,18 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, - return ret; - } - --void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) -+void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags) - { - return __kmem_cache_alloc_lru(s, NULL, gfpflags); - } --EXPORT_SYMBOL(kmem_cache_alloc); -+EXPORT_SYMBOL(kmem_cache_alloc_noprof); - --void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, -+void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, - gfp_t gfpflags) - { - return __kmem_cache_alloc_lru(s, lru, gfpflags); - } --EXPORT_SYMBOL(kmem_cache_alloc_lru); -+EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); - - void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t orig_size, -@@ -3491,7 +3491,7 @@ void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, - caller, orig_size); - } - --void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) -+void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node) - { - void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); - -@@ -3499,7 +3499,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) - - return ret; - } --EXPORT_SYMBOL(kmem_cache_alloc_node); -+EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); - - static noinline void free_to_partial_list( - struct kmem_cache *s, struct slab *slab, -@@ -3779,6 +3779,7 @@ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, - unsigned long addr) - { - memcg_slab_free_hook(s, slab, p, cnt); -+ alloc_tagging_slab_free_hook(s, slab, p, cnt); - /* - * With KASAN enabled slab_free_freelist_hook modifies the freelist - * to remove objects, whose reuse must be delayed. -@@ -4009,8 +4010,8 @@ static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - #endif /* CONFIG_SLUB_TINY */ - - /* Note that interrupts must be enabled when calling this function. 
*/ --int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, -- void **p) -+int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, -+ void **p) - { - int i; - struct obj_cgroup *objcg = NULL; -@@ -4034,7 +4035,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - slab_want_init_on_alloc(flags, s), s->object_size); - return i; - } --EXPORT_SYMBOL(kmem_cache_alloc_bulk); -+EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); - - - /* -@@ -5020,7 +5021,8 @@ void __init kmem_cache_init(void) - node_set(node, slab_nodes); - - create_boot_cache(kmem_cache_node, "kmem_cache_node", -- sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); -+ sizeof(struct kmem_cache_node), -+ SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); - - hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); - -@@ -5030,7 +5032,7 @@ void __init kmem_cache_init(void) - create_boot_cache(kmem_cache, "kmem_cache", - offsetof(struct kmem_cache, node) + - nr_node_ids * sizeof(struct kmem_cache_node *), -- SLAB_HWCACHE_ALIGN, 0, 0); -+ SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); - - kmem_cache = bootstrap(&boot_kmem_cache); - kmem_cache_node = bootstrap(&boot_kmem_cache_node); -diff --git a/mm/util.c b/mm/util.c -index dd12b9531..9d24b8870 100644 ---- a/mm/util.c -+++ b/mm/util.c -@@ -115,7 +115,7 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp) - EXPORT_SYMBOL(kstrndup); - - /** -- * kmemdup - duplicate region of memory -+ * kmemdup_noprof - duplicate region of memory - * - * @src: memory region to duplicate - * @len: memory region length -@@ -124,16 +124,16 @@ EXPORT_SYMBOL(kstrndup); - * Return: newly allocated copy of @src or %NULL in case of error, - * result is physically contiguous. Use kfree() to free. - */ --void *kmemdup(const void *src, size_t len, gfp_t gfp) -+void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) - { - void *p; - -- p = kmalloc_track_caller(len, gfp); -+ p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_); - if (p) - memcpy(p, src, len); - return p; - } --EXPORT_SYMBOL(kmemdup); -+EXPORT_SYMBOL(kmemdup_noprof); - - /** - * kvmemdup - duplicate region of memory -@@ -564,7 +564,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, - EXPORT_SYMBOL(vm_mmap); - - /** -- * kvmalloc_node - attempt to allocate physically contiguous memory, but upon -+ * kvmalloc_node_noprof - attempt to allocate physically contiguous memory, but upon - * failure, fall back to non-contiguous (vmalloc) allocation. - * @size: size of the request. - * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. -@@ -579,7 +579,7 @@ EXPORT_SYMBOL(vm_mmap); - * - * Return: pointer to the allocated memory of %NULL in case of failure - */ --void *kvmalloc_node(size_t size, gfp_t flags, int node) -+void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) - { - gfp_t kmalloc_flags = flags; - void *ret; -@@ -601,7 +601,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) - kmalloc_flags &= ~__GFP_NOFAIL; - } - -- ret = kmalloc_node(size, kmalloc_flags, node); -+ ret = kmalloc_node_noprof(size, kmalloc_flags, node); - - /* - * It doesn't really make sense to fallback to vmalloc for sub page -@@ -626,11 +626,11 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) - * about the resulting pointer, and cannot play - * protection games. 
- */ -- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, -+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, - flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, - node, __builtin_return_address(0)); - } --EXPORT_SYMBOL(kvmalloc_node); -+EXPORT_SYMBOL(kvmalloc_node_noprof); - - /** - * kvfree() - Free memory. -@@ -669,7 +669,7 @@ void kvfree_sensitive(const void *addr, size_t len) - } - EXPORT_SYMBOL(kvfree_sensitive); - --void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) -+void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - { - void *newp; - -@@ -682,15 +682,15 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - kvfree(p); - return newp; - } --EXPORT_SYMBOL(kvrealloc); -+EXPORT_SYMBOL(kvrealloc_noprof); - - /** -- * __vmalloc_array - allocate memory for a virtually contiguous array. -+ * __vmalloc_array_noprof - allocate memory for a virtually contiguous array. - * @n: number of elements. - * @size: element size. - * @flags: the type of memory to allocate (see kmalloc). - */ --void *__vmalloc_array(size_t n, size_t size, gfp_t flags) -+void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) - { - size_t bytes; - -@@ -698,18 +698,18 @@ void *__vmalloc_array(size_t n, size_t size, gfp_t flags) - return NULL; - return __vmalloc(bytes, flags); - } --EXPORT_SYMBOL(__vmalloc_array); -+EXPORT_SYMBOL(__vmalloc_array_noprof); - - /** -- * vmalloc_array - allocate memory for a virtually contiguous array. -+ * vmalloc_array_noprof - allocate memory for a virtually contiguous array. - * @n: number of elements. - * @size: element size. - */ --void *vmalloc_array(size_t n, size_t size) -+void *vmalloc_array_noprof(size_t n, size_t size) - { - return __vmalloc_array(n, size, GFP_KERNEL); - } --EXPORT_SYMBOL(vmalloc_array); -+EXPORT_SYMBOL(vmalloc_array_noprof); - - /** - * __vcalloc - allocate and zero memory for a virtually contiguous array. -@@ -717,22 +717,22 @@ EXPORT_SYMBOL(vmalloc_array); - * @size: element size. - * @flags: the type of memory to allocate (see kmalloc). - */ --void *__vcalloc(size_t n, size_t size, gfp_t flags) -+void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) - { - return __vmalloc_array(n, size, flags | __GFP_ZERO); - } --EXPORT_SYMBOL(__vcalloc); -+EXPORT_SYMBOL(__vcalloc_noprof); - - /** -- * vcalloc - allocate and zero memory for a virtually contiguous array. -+ * vcalloc_noprof - allocate and zero memory for a virtually contiguous array. - * @n: number of elements. - * @size: element size. - */ --void *vcalloc(size_t n, size_t size) -+void *vcalloc_noprof(size_t n, size_t size) - { - return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO); - } --EXPORT_SYMBOL(vcalloc); -+EXPORT_SYMBOL(vcalloc_noprof); - - /* Neutral page->mapping pointer to address_space or anon_vma or other */ - void *page_rmapping(struct page *page) -diff --git a/mm/vmalloc.c b/mm/vmalloc.c -index 1d13d7168..4c199cf9b 100644 ---- a/mm/vmalloc.c -+++ b/mm/vmalloc.c -@@ -2971,12 +2971,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid, - * but mempolicy wants to alloc memory by interleaving. 
- */ - if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) -- nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, -+ nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp, - nr_pages_request, - pages + nr_allocated); - - else -- nr = alloc_pages_bulk_array_node(bulk_gfp, nid, -+ nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid, - nr_pages_request, - pages + nr_allocated); - -@@ -3006,9 +3006,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid, - break; - - if (nid == NUMA_NO_NODE) -- page = alloc_pages(alloc_gfp, order); -+ page = alloc_pages_noprof(alloc_gfp, order); - else -- page = alloc_pages_node(nid, alloc_gfp, order); -+ page = alloc_pages_node_noprof(nid, alloc_gfp, order); - if (unlikely(!page)) { - if (!nofail) - break; -@@ -3065,10 +3065,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - - /* Please note that the recursion is strictly bounded. */ - if (array_size > PAGE_SIZE) { -- area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, -+ area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node, - area->caller); - } else { -- area->pages = kmalloc_node(array_size, nested_gfp, node); -+ area->pages = kmalloc_node_noprof(array_size, nested_gfp, node); - } - - if (!area->pages) { -@@ -3151,7 +3151,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - } - - /** -- * __vmalloc_node_range - allocate virtually contiguous memory -+ * __vmalloc_node_range_noprof - allocate virtually contiguous memory - * @size: allocation size - * @align: desired alignment - * @start: vm area range start -@@ -3178,7 +3178,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - * - * Return: the address of the area or %NULL on failure - */ --void *__vmalloc_node_range(unsigned long size, unsigned long align, -+void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, - unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, unsigned long vm_flags, int node, - const void *caller) -@@ -3307,7 +3307,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, - } - - /** -- * __vmalloc_node - allocate virtually contiguous memory -+ * __vmalloc_node_noprof - allocate virtually contiguous memory - * @size: allocation size - * @align: desired alignment - * @gfp_mask: flags for the page level allocator -@@ -3325,10 +3325,10 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *__vmalloc_node(unsigned long size, unsigned long align, -+void *__vmalloc_node_noprof(unsigned long size, unsigned long align, - gfp_t gfp_mask, int node, const void *caller) - { -- return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, -+ return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, - gfp_mask, PAGE_KERNEL, 0, node, caller); - } - /* -@@ -3337,15 +3337,15 @@ void *__vmalloc_node(unsigned long size, unsigned long align, - * than that. 
- */ - #ifdef CONFIG_TEST_VMALLOC_MODULE --EXPORT_SYMBOL_GPL(__vmalloc_node); -+EXPORT_SYMBOL_GPL(__vmalloc_node_noprof); - #endif - --void *__vmalloc(unsigned long size, gfp_t gfp_mask) -+void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) - { -- return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, -+ return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(__vmalloc); -+EXPORT_SYMBOL(__vmalloc_noprof); - - /** - * vmalloc - allocate virtually contiguous memory -@@ -3359,12 +3359,12 @@ EXPORT_SYMBOL(__vmalloc); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vmalloc(unsigned long size) -+void *vmalloc_noprof(unsigned long size) - { -- return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, -+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vmalloc); -+EXPORT_SYMBOL(vmalloc_noprof); - - /** - * vmalloc_huge - allocate virtually contiguous memory, allow huge pages -@@ -3378,16 +3378,16 @@ EXPORT_SYMBOL(vmalloc); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) -+void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) - { -- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, -+ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, - gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, - NUMA_NO_NODE, __builtin_return_address(0)); - } --EXPORT_SYMBOL_GPL(vmalloc_huge); -+EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); - - /** -- * vzalloc - allocate virtually contiguous memory with zero fill -+ * vzalloc_noprof - allocate virtually contiguous memory with zero fill - * @size: allocation size - * - * Allocate enough pages to cover @size from the page level -@@ -3399,12 +3399,12 @@ EXPORT_SYMBOL_GPL(vmalloc_huge); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vzalloc(unsigned long size) -+void *vzalloc_noprof(unsigned long size) - { -- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, -+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vzalloc); -+EXPORT_SYMBOL(vzalloc_noprof); - - /** - * vmalloc_user - allocate zeroed virtually contiguous memory for userspace -@@ -3415,17 +3415,17 @@ EXPORT_SYMBOL(vzalloc); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vmalloc_user(unsigned long size) -+void *vmalloc_user_noprof(unsigned long size) - { -- return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, -+ return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, - GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, - VM_USERMAP, NUMA_NO_NODE, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vmalloc_user); -+EXPORT_SYMBOL(vmalloc_user_noprof); - - /** -- * vmalloc_node - allocate memory on a specific node -+ * vmalloc_node_noprof - allocate memory on a specific node - * @size: allocation size - * @node: numa node - * -@@ -3437,15 +3437,15 @@ EXPORT_SYMBOL(vmalloc_user); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vmalloc_node(unsigned long size, int node) -+void *vmalloc_node_noprof(unsigned long size, int node) - { -- return __vmalloc_node(size, 1, GFP_KERNEL, node, -+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vmalloc_node); 
-+EXPORT_SYMBOL(vmalloc_node_noprof); - - /** -- * vzalloc_node - allocate memory on a specific node with zero fill -+ * vzalloc_node_noprof - allocate memory on a specific node with zero fill - * @size: allocation size - * @node: numa node - * -@@ -3455,12 +3455,12 @@ EXPORT_SYMBOL(vmalloc_node); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vzalloc_node(unsigned long size, int node) -+void *vzalloc_node_noprof(unsigned long size, int node) - { -- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, -+ return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vzalloc_node); -+EXPORT_SYMBOL(vzalloc_node_noprof); - - #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) - #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) -@@ -3475,7 +3475,7 @@ EXPORT_SYMBOL(vzalloc_node); - #endif - - /** -- * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) -+ * vmalloc_32_noprof - allocate virtually contiguous memory (32bit addressable) - * @size: allocation size - * - * Allocate enough 32bit PA addressable pages to cover @size from the -@@ -3483,15 +3483,15 @@ EXPORT_SYMBOL(vzalloc_node); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vmalloc_32(unsigned long size) -+void *vmalloc_32_noprof(unsigned long size) - { -- return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, -+ return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vmalloc_32); -+EXPORT_SYMBOL(vmalloc_32_noprof); - - /** -- * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory -+ * vmalloc_32_user_noprof - allocate zeroed virtually contiguous 32bit memory - * @size: allocation size - * - * The resulting memory area is 32bit addressable and zeroed so it can be -@@ -3499,14 +3499,14 @@ EXPORT_SYMBOL(vmalloc_32); - * - * Return: pointer to the allocated memory or %NULL on error - */ --void *vmalloc_32_user(unsigned long size) -+void *vmalloc_32_user_noprof(unsigned long size) - { -- return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, -+ return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, - GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, - VM_USERMAP, NUMA_NO_NODE, - __builtin_return_address(0)); - } --EXPORT_SYMBOL(vmalloc_32_user); -+EXPORT_SYMBOL(vmalloc_32_user_noprof); - - /* - * Atomically zero bytes in the iterator. diff --git a/mm/vmscan.c b/mm/vmscan.c -index d6802821d..a22f36ec7 100644 +index 445ce9324..19067fa9a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -58,6 +58,7 @@ +@@ -57,6 +57,7 @@ + #include #include #include - #include +#include #include #include -@@ -698,7 +699,6 @@ static int __prealloc_shrinker(struct shrinker *shrinker) +@@ -702,7 +703,6 @@ static int __prealloc_shrinker(struct shrinker *shrinker) return 0; } @@ -103158,7 +98729,7 @@ index d6802821d..a22f36ec7 100644 int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) { va_list ap; -@@ -718,19 +718,12 @@ int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) +@@ -722,19 +722,12 @@ int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
return err; } @@ -103179,7 +98750,7 @@ index d6802821d..a22f36ec7 100644 if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); -@@ -761,7 +754,6 @@ static int __register_shrinker(struct shrinker *shrinker) +@@ -765,7 +758,6 @@ static int __register_shrinker(struct shrinker *shrinker) return 0; } @@ -103187,7 +98758,7 @@ index d6802821d..a22f36ec7 100644 int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) { va_list ap; -@@ -780,12 +772,6 @@ int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) +@@ -784,12 +776,6 @@ int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) } return err; } @@ -103200,7 +98771,7 @@ index d6802821d..a22f36ec7 100644 EXPORT_SYMBOL(register_shrinker); /* -@@ -811,6 +797,9 @@ void unregister_shrinker(struct shrinker *shrinker) +@@ -815,6 +801,9 @@ void unregister_shrinker(struct shrinker *shrinker) kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; @@ -103210,7 +98781,7 @@ index d6802821d..a22f36ec7 100644 } EXPORT_SYMBOL(unregister_shrinker); -@@ -829,6 +818,80 @@ void synchronize_shrinkers(void) +@@ -833,6 +822,80 @@ void synchronize_shrinkers(void) } EXPORT_SYMBOL(synchronize_shrinkers); @@ -103291,7 +98862,7 @@ index d6802821d..a22f36ec7 100644 #define SHRINK_BATCH 128 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, -@@ -895,12 +958,16 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, +@@ -899,12 +962,16 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); @@ -103327,7 +98898,7 @@ index 7778cc97a..5341736f2 100644 +# eval_vars(X_,a/b/c) = $(X_a_b_c) $(X_a_b) $(X_a) +eval_vars = $(foreach var,$(call flatten_dirs,$(2)),$($(1)$(var))) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib -index 100a386fc..1f106c71e 100644 +index 68d0134bd..48ded392d 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -148,7 +148,7 @@ _cpp_flags = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(target-stem).lds) @@ -103340,10 +98911,10 @@ index 100a386fc..1f106c71e 100644 endif diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c -index 0d2db4117..7b7dbeb5b 100644 +index 653b92f6d..47978efe4 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c -@@ -203,6 +203,11 @@ static int symbol_in_range(const struct sym_entry *s, +@@ -204,6 +204,11 @@ static int symbol_in_range(const struct sym_entry *s, return 0; } @@ -103355,7 +98926,7 @@ index 0d2db4117..7b7dbeb5b 100644 static int symbol_valid(const struct sym_entry *s) { const char *name = sym_name(s); -@@ -210,6 +215,14 @@ static int symbol_valid(const struct sym_entry *s) +@@ -211,6 +216,14 @@ static int symbol_valid(const struct sym_entry *s) /* if --all-symbols is not specified, then symbols outside the text * and inittext sections are discarded */ if (!all_symbols) { @@ -103370,36 +98941,5 @@ index 0d2db4117..7b7dbeb5b 100644 if (symbol_in_range(s, text_ranges, ARRAY_SIZE(text_ranges)) == 0) return 0; -diff --git a/scripts/module.lds.S b/scripts/module.lds.S -index bf5bcf283..45c67a099 100644 ---- a/scripts/module.lds.S -+++ b/scripts/module.lds.S -@@ -9,6 +9,8 @@ - #define DISCARD_EH_FRAME *(.eh_frame) - #endif - -+#include -+ - SECTIONS { - /DISCARD/ : { - *(.discard) -@@ -47,12 +49,17 @@ SECTIONS { - .data : { - *(.data .data.[0-9a-zA-Z_]*) - *(.data..L*) -+ CODETAG_SECTIONS() - } - - .rodata : { - *(.rodata .rodata.[0-9a-zA-Z_]*) - *(.rodata..L*) - } -+#else -+ 
.data : {
+ CODETAG_SECTIONS()
+ }
 #endif
 }
 -- 
-2.41.0.159.g0bfa463d37
+2.42.0
diff --git a/scripts/source.sh b/scripts/source.sh
index 336ce64..ec6b297 100755
--- a/scripts/source.sh
+++ b/scripts/source.sh
@@ -2,7 +2,7 @@
 
 echo "Pika Kernel - Getting source"
 
-wget -nv https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/snapshot/linux-6.5-rc7.tar.gz
-tar -xf ./linux-6.5-rc7.tar.gz
+wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.5.tar.gz
+tar -xf ./linux-6.5.tar.gz
 
-cd linux-6.5-rc7
+cd linux-6.5